diff --git a/.config/nextest.toml b/.config/nextest.toml index 8bccd51c6d..affdc16f31 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -1,2 +1,2 @@ [profile.default] -slow-timeout = "1m" +slow-timeout = { period = "60s", terminate-after = 3 } diff --git a/.dockerignore b/.dockerignore index ae0ad8fd77..c7a2f78e32 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,27 +1,30 @@ * -!rust-toolchain.toml -!Cargo.toml +# Files !Cargo.lock +!Cargo.toml !Makefile +!rust-toolchain.toml +!scripts/combine_control_files.py +!scripts/ninstall.sh +!vm-cgconfig.conf +!docker-compose/run-tests.sh +# Directories !.cargo/ !.config/ -!control_plane/ !compute_tools/ +!control_plane/ !libs/ +!neon_local/ !pageserver/ +!patches/ !pgxn/ !proxy/ +!storage_scrubber/ !safekeeper/ -!s3_scrubber/ !storage_broker/ +!storage_controller/ !trace/ -!vendor/postgres-v14/ -!vendor/postgres-v15/ -!vendor/postgres-v16/ +!vendor/postgres-*/ !workspace_hack/ -!neon_local/ -!scripts/ninstall.sh -!scripts/combine_control_files.py -!vm-cgconfig.conf diff --git a/.github/ISSUE_TEMPLATE/epic-template.md b/.github/ISSUE_TEMPLATE/epic-template.md index 019e6e7345..c442f50fde 100644 --- a/.github/ISSUE_TEMPLATE/epic-template.md +++ b/.github/ISSUE_TEMPLATE/epic-template.md @@ -16,9 +16,9 @@ assignees: '' ## Implementation ideas - +## Tasks ```[tasklist] -### Tasks +- [ ] Example Task ``` diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 362480f256..37983798b7 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -1,10 +1,11 @@ self-hosted-runner: labels: - arm64 - - dev - gen3 - large + - large-arm64 - small + - small-arm64 - us-east-2 config-variables: - REMOTE_STORAGE_AZURE_CONTAINER diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index abdbba802e..f84beff20c 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -39,7 +39,7 @@ 
runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else @@ -59,7 +59,7 @@ runs: BUCKET: neon-github-public-dev # TODO: We can replace with a special docker image with Java and Allure pre-installed - - uses: actions/setup-java@v3 + - uses: actions/setup-java@v4 with: distribution: 'temurin' java-version: '17' @@ -76,8 +76,8 @@ runs: rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.24.0 - ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90 + ALLURE_VERSION: 2.27.0 + ALLURE_ZIP_SHA256: b071858fb2fa542c65d8f152c5c40d26267b2dfb74df1f1608a589ecca38e777 # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this - name: Acquire lock @@ -150,7 +150,7 @@ runs: # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work, # and to keep files on the host to upload them to the database - time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}" + time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/" # Generate redirect cat < ${WORKDIR}/index.html @@ -179,22 +179,11 @@ runs: aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" fi - - name: Store Allure test stat in the DB - if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} - shell: bash -euxo pipefail {0} - env: - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }} - run: | - export 
DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR} - - ./scripts/pysync - - poetry run python3 scripts/ingest_regress_test_result.py \ - --revision ${COMMIT_SHA} \ - --reference ${GITHUB_REF} \ - --build-type unified \ - --ingest ${WORKDIR}/report/data/suites.json + - name: Cache poetry deps + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} @@ -226,7 +215,7 @@ runs: rm -rf ${WORKDIR} fi - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 if: always() env: REPORT_URL: ${{ steps.generate-report.outputs.report-url }} diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 7ae9937d42..df4a6712ac 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -19,7 +19,7 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml index f1eea34ab9..9f752d5a89 100644 --- a/.github/actions/neon-branch-create/action.yml +++ b/.github/actions/neon-branch-create/action.yml @@ -3,14 +3,14 @@ description: 'Create Branch using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project to create Branch in' + description: 'ID of the Project to create Branch in' required: true api_host: - 
desctiption: 'Neon API host' - default: console.stage.neon.tech + description: 'Neon API host' + default: console-stage.neon.build outputs: dsn: description: 'Created Branch DSN (for main database)' diff --git a/.github/actions/neon-branch-delete/action.yml b/.github/actions/neon-branch-delete/action.yml index f8cd351dd9..58141a4a3f 100644 --- a/.github/actions/neon-branch-delete/action.yml +++ b/.github/actions/neon-branch-delete/action.yml @@ -3,17 +3,17 @@ description: 'Delete Branch using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project which should be deleted' + description: 'ID of the Project which should be deleted' required: true branch_id: - desctiption: 'ID of the branch to delete' + description: 'ID of the branch to delete' required: true api_host: - desctiption: 'Neon API host' - default: console.stage.neon.tech + description: 'Neon API host' + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index ae6464990e..16759ad038 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -3,22 +3,22 @@ description: 'Create Neon Project using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true region_id: - desctiption: 'Region ID, if not set the project will be created in the default region' + description: 'Region ID, if not set the project will be created in the default region' default: aws-us-east-2 postgres_version: - desctiption: 'Postgres version; default is 15' - default: 15 + description: 'Postgres version; default is 15' + default: '15' api_host: - desctiption: 'Neon API host' - default: console.stage.neon.tech + description: 'Neon API host' + default: console-stage.neon.build provisioner: - desctiption: 'k8s-pod or k8s-neonvm' + description: 
'k8s-pod or k8s-neonvm' default: 'k8s-pod' compute_units: - desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' default: '[1, 1]' outputs: diff --git a/.github/actions/neon-project-delete/action.yml b/.github/actions/neon-project-delete/action.yml index adc8510a34..35e165fd61 100644 --- a/.github/actions/neon-project-delete/action.yml +++ b/.github/actions/neon-project-delete/action.yml @@ -3,14 +3,14 @@ description: 'Delete Neon Project using API' inputs: api_key: - desctiption: 'Neon API key' + description: 'Neon API key' required: true project_id: - desctiption: 'ID of the Project to delete' + description: 'ID of the Project to delete' required: true api_host: - desctiption: 'Neon API host' - default: console.stage.neon.tech + description: 'Neon API host' + default: console-stage.neon.build runs: using: "composite" diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 8dfa6c465f..d9e543d4bb 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,10 @@ inputs: description: 'Postgres version to use for tests' required: false default: 'v14' + benchmark_durations: + description: 'benchmark durations JSON' + required: false + default: '{}' runs: using: "composite" @@ -76,17 +80,16 @@ runs: - name: Checkout if: inputs.needs_postgres_source == 'true' - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-python-deps-${{ 
hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} @@ -160,7 +163,7 @@ runs: # We use pytest-split plugin to run benchmarks in parallel on different CI runners if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then mkdir -p $TEST_OUTPUT - poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json" + echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS" fi diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 584828c1d0..078c7f88c4 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -16,8 +16,15 @@ concurrency: cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name}} + actionlint: - runs-on: ubuntu-latest + needs: [ check-permissions ] + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 - uses: reviewdog/action-actionlint@v1 @@ -29,3 +36,15 @@ jobs: fail_on_error: true filter_mode: nofilter level: error + - run: | + PAT='^\s*runs-on:.*-latest' + if grep -ERq $PAT .github/workflows + then + grep -ERl $PAT .github/workflows |\ + while read -r f + do + l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1) + echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead." 
+ done + exit 1 + fi diff --git a/.github/workflows/approved-for-ci-run.yml b/.github/workflows/approved-for-ci-run.yml index 5b21011b83..0a0898d30c 100644 --- a/.github/workflows/approved-for-ci-run.yml +++ b/.github/workflows/approved-for-ci-run.yml @@ -18,6 +18,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: false env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -43,7 +44,7 @@ jobs: contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) && contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" @@ -59,24 +60,50 @@ jobs: github.event.action == 'labeled' && contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run') - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - run: gh pr --repo "${GITHUB_REPOSITORY}" edit "${PR_NUMBER}" --remove-label "approved-for-ci-run" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: main token: ${{ secrets.CI_ACCESS_TOKEN }} + + - name: Look for existing PR + id: get-pr + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')" + echo "ALREADY_CREATED=${ALREADY_CREATED}" >> ${GITHUB_OUTPUT} + + - name: Get changed labels + id: get-labels + if: steps.get-pr.outputs.ALREADY_CREATED != '' + env: + ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + LABELS_TO_REMOVE=$(comm -23 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) | sort) \ + <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\ + ( grep 
-v run-e2e-tests-in-draft || true ) | paste -sd , -) + LABELS_TO_ADD=$(comm -13 <(gh pr --repo ${GITHUB_REPOSITORY} view ${ALREADY_CREATED} --json labels --jq '.labels.[].name'| ( grep -E '^run' || true ) |sort) \ + <(gh pr --repo ${GITHUB_REPOSITORY} view ${PR_NUMBER} --json labels --jq '.labels.[].name' | ( grep -E '^run' || true ) | sort ) |\ + paste -sd , -) + echo "LABELS_TO_ADD=${LABELS_TO_ADD}" >> ${GITHUB_OUTPUT} + echo "LABELS_TO_REMOVE=${LABELS_TO_REMOVE}" >> ${GITHUB_OUTPUT} - run: gh pr checkout "${PR_NUMBER}" - run: git checkout -b "${BRANCH}" - run: git push --force origin "${BRANCH}" + if: steps.get-pr.outputs.ALREADY_CREATED == '' - name: Create a Pull Request for CI run (if required) - env: + if: steps.get-pr.outputs.ALREADY_CREATED == '' + env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md @@ -87,15 +114,33 @@ jobs: Feel free to review/comment/discuss the original PR #${PR_NUMBER}. EOF - ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')" - if [ -z "${ALREADY_CREATED}" ]; then - gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \ + LABELS=$( (gh pr --repo "${GITHUB_REPOSITORY}" view ${PR_NUMBER} --json labels --jq '.labels.[].name'; echo run-e2e-tests-in-draft )| \ + grep -E '^run' | paste -sd , -) + gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \ --body-file "body.md" \ --head "${BRANCH}" \ --base "main" \ + --label ${LABELS} \ --draft + - name: Modify the existing pull request (if required) + if: steps.get-pr.outputs.ALREADY_CREATED != '' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + LABELS_TO_ADD: ${{ steps.get-labels.outputs.LABELS_TO_ADD }} + LABELS_TO_REMOVE: ${{ steps.get-labels.outputs.LABELS_TO_REMOVE }} + ALREADY_CREATED: ${{ steps.get-pr.outputs.ALREADY_CREATED }} + run: | + ADD_CMD= + REMOVE_CMD= + [ -z "${LABELS_TO_ADD}" ] || ADD_CMD="--add-label ${LABELS_TO_ADD}" + [ -z 
"${LABELS_TO_REMOVE}" ] || REMOVE_CMD="--remove-label ${LABELS_TO_REMOVE}" + if [ -n "${ADD_CMD}" ] || [ -n "${REMOVE_CMD}" ]; then + gh pr --repo "${GITHUB_REPOSITORY}" edit ${ALREADY_CREATED} ${ADD_CMD} ${REMOVE_CMD} fi + - run: git push --force origin "${BRANCH}" + if: steps.get-pr.outputs.ALREADY_CREATED != '' + cleanup: # Close PRs and delete branchs if the original PR is closed. @@ -107,7 +152,7 @@ jobs: github.event.action == 'closed' && github.event.pull_request.head.repo.full_name != github.repository - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 8bf12c31b1..9eff483680 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -38,6 +38,11 @@ on: description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch' required: false default: false + run_only_pgvector_tests: + type: boolean + description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run' + required: false + default: false defaults: run: @@ -50,6 +55,7 @@ concurrency: jobs: bench: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -62,11 +68,11 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -93,7 +99,7 @@ jobs: # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests - extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py + extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -120,6 +126,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} generate-matrices: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) # # Available platforms: @@ -130,7 +137,7 @@ jobs: # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage env: RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 outputs: pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }} @@ -147,15 +154,16 @@ jobs: "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }] + "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "platform": "neon-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "platform": "neonvm-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": 
"rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + { "platform": "rds-aurora", "db_size": "50gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -171,7 +179,7 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, - { "platform": "rds-aurora" }]') + { "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -190,12 +198,13 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + { "platform": "rds-aurora", "scale": "10" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT pgbench-compare: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} needs: [ generate-matrices ] strategy: @@ -214,14 +223,14 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init # Increase timeout to 8h, default timeout is 6h timeout-minutes: 480 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -253,6 +262,9 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; + neonvm-captest-sharding-reuse) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} + ;; neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; @@ -270,11 +282,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + 
QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Benchmark init uses: ./.github/actions/run-python-test-set @@ -335,6 +351,92 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + pgbench-pgvector: + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "15m" + TEST_PG_BENCH_SCALES_MATRIX: "1" + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 16 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-captest-pgvector" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Add Postgres binaries to PATH + run: | + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH + + - name: Set up Connection String + id: set-up-connstr + run: | + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + QUERIES=("SELECT version()") + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done + + - name: Benchmark pgvector hnsw indexing + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_perf_olap.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster 
--timeout 21600 -k test_pgvector_indexing + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + + - name: Benchmark pgvector queries + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_perf_pgvector_queries.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + clickbench-compare: # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. 
@@ -343,7 +445,7 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, pgbench-compare ] strategy: @@ -362,11 +464,11 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -401,11 +503,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set @@ -443,7 +549,7 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. 
# # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, clickbench-compare ] strategy: @@ -461,11 +567,11 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -507,11 +613,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set @@ -541,7 +651,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, tpch-compare ] strategy: @@ -558,11 +668,11 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download Neon artifact uses: ./.github/actions/download @@ -597,11 +707,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; 
then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run user examples uses: ./.github/actions/run-python-test-set diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml new file mode 100644 index 0000000000..6e90a80ab7 --- /dev/null +++ b/.github/workflows/build-build-tools-image.yml @@ -0,0 +1,105 @@ +name: Build build-tools image + +on: + workflow_call: + inputs: + image-tag: + description: "build-tools image tag" + required: true + type: string + outputs: + image-tag: + description: "build-tools tag" + value: ${{ inputs.image-tag }} + image: + description: "build-tools image" + value: neondatabase/build-tools:${{ inputs.image-tag }} + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: build-build-tools-image-${{ inputs.image-tag }} + cancel-in-progress: false + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +jobs: + check-image: + uses: ./.github/workflows/check-build-tools-image.yml + + build-image: + needs: [ check-image ] + if: needs.check-image.outputs.found == 'false' + + strategy: + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + + env: + IMAGE_TAG: ${{ inputs.image-tag }} + + steps: + - name: Check `input.tag` is correct + env: + INPUTS_IMAGE_TAG: ${{ inputs.image-tag }} + CHECK_IMAGE_TAG : ${{ needs.check-image.outputs.image-tag }} + run: | + if [ "${INPUTS_IMAGE_TAG}" != "${CHECK_IMAGE_TAG}" ]; then + echo "'inputs.image-tag' (${INPUTS_IMAGE_TAG}) does not match the tag of the latest build-tools image 'inputs.image-tag' (${CHECK_IMAGE_TAG})" + exit 1 + fi + + - uses: actions/checkout@v4 + + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory + run: | + mkdir -p /tmp/.docker-custom + echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV + + - uses: docker/setup-buildx-action@v2 + + - uses: docker/login-action@v2 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - uses: docker/build-push-action@v4 + with: + context: . 
+ provenance: false + push: true + pull: true + file: Dockerfile.build-tools + cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max + tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} + + - name: Remove custom docker config directory + run: | + rm -rf /tmp/.docker-custom + + merge-images: + needs: [ build-image ] + runs-on: ubuntu-22.04 + + env: + IMAGE_TAG: ${{ inputs.image-tag }} + + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch image + run: | + docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \ + neondatabase/build-tools:${IMAGE_TAG}-x64 \ + neondatabase/build-tools:${IMAGE_TAG}-arm64 diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml deleted file mode 100644 index e401b2f418..0000000000 --- a/.github/workflows/build_and_push_docker_image.yml +++ /dev/null @@ -1,105 +0,0 @@ -name: Build and Push Docker Image - -on: - workflow_call: - inputs: - dockerfile-path: - required: true - type: string - image-name: - required: true - type: string - outputs: - build-tools-tag: - description: "tag generated for build tools" - value: ${{ jobs.tag.outputs.build-tools-tag }} - -jobs: - check-if-build-tools-dockerfile-changed: - runs-on: ubuntu-latest - outputs: - docker_file_changed: ${{ steps.dockerfile.outputs.docker_file_changed }} - steps: - - name: Check if Dockerfile.buildtools has changed - id: dockerfile - run: | - if [[ "$GITHUB_EVENT_NAME" != "pull_request" ]]; then - echo "docker_file_changed=false" >> $GITHUB_OUTPUT - exit - fi - updated_files=$(gh pr --repo neondatabase/neon diff ${{ github.event.pull_request.number }} --name-only) - if [[ $updated_files == *"Dockerfile.buildtools"* ]]; then - echo 
"docker_file_changed=true" >> $GITHUB_OUTPUT - fi - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - tag: - runs-on: ubuntu-latest - needs: [ check-if-build-tools-dockerfile-changed ] - outputs: - build-tools-tag: ${{steps.buildtools-tag.outputs.image_tag}} - - steps: - - name: Get buildtools tag - env: - DOCKERFILE_CHANGED: ${{ needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed }} - run: | - if [[ "$GITHUB_EVENT_NAME" == "pull_request" ]] && [[ "${DOCKERFILE_CHANGED}" == "true" ]]; then - IMAGE_TAG=$GITHUB_RUN_ID - else - IMAGE_TAG=pinned - fi - - echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT - shell: bash - id: buildtools-tag - - kaniko: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - needs: [ tag, check-if-build-tools-dockerfile-changed ] - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 - - kaniko-arm: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - needs: [ tag, check-if-build-tools-dockerfile-changed ] - runs-on: [ self-hosted, dev, arm64 ] - container: gcr.io/kaniko-project/executor:v1.7.0-debug - - steps: - - name: Checkout - uses: actions/checkout@v1 - - - name: Configure ECR login - run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - - - name: Kaniko build - run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --dockerfile ${{ 
inputs.dockerfile-path }} --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - - manifest: - if: needs.check-if-build-tools-dockerfile-changed.outputs.docker_file_changed == 'true' - name: 'manifest' - runs-on: [ self-hosted, dev, x64 ] - needs: - - tag - - kaniko - - kaniko-arm - - check-if-build-tools-dockerfile-changed - - steps: - - name: Create manifest - run: docker manifest create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-amd64 --amend 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }}-arm64 - - - name: Push manifest - run: docker manifest push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${{ inputs.image-name }}:${{ needs.tag.outputs.build-tools-tag }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 880d6044f2..8c8500260c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -5,6 +5,7 @@ on: branches: - main - release + - release-proxy pull_request: defaults: @@ -21,28 +22,29 @@ env: COPT: '-Werror' AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix + E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} jobs: check-permissions: - runs-on: ubuntu-latest + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + 
github-event-name: ${{ github.event_name}} + + cancel-previous-e2e-tests: + needs: [ check-permissions ] + if: github.event_name == 'pull_request' + runs-on: ubuntu-22.04 steps: - - name: Disallow PRs from forks - if: | - github.event_name == 'pull_request' && - github.event.pull_request.head.repo.full_name != github.repository - - run: | - if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then - MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork" - else - MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run" - fi - - echo >&2 "We don't run CI for PRs from forks" - echo >&2 "${MESSAGE}" - - exit 1 + - name: Cancel previous e2e-tests runs for this PR + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + gh workflow --repo neondatabase/cloud \ + run cancel-previous-in-concurrency-group.yml \ + --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" tag: needs: [ check-permissions ] @@ -53,7 +55,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -66,6 +68,8 @@ jobs: echo "tag=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release" ]]; then echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT @@ -73,34 +77,39 @@ jobs: shell: bash id: build-tag - build-buildtools-image: + check-build-tools-image: needs: [ check-permissions ] - uses: ./.github/workflows/build_and_push_docker_image.yml + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: 
./.github/workflows/build-build-tools-image.yml with: - dockerfile-path: Dockerfile.buildtools - image-name: build-tools + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} secrets: inherit check-codestyle-python: - needs: [ check-permissions, build-buildtools-image ] + needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: false fetch-depth: 1 - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -115,15 +124,18 @@ jobs: run: poetry run mypy . 
check-codestyle-rust: - needs: [ check-permissions, build-buildtools-image ] - runs-on: [ self-hosted, gen3, large ] + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 @@ -131,7 +143,7 @@ jobs: # Disabled for now # - name: Restore cargo deps cache # id: cache_cargo -# uses: actions/cache@v3 +# uses: actions/cache@v4 # with: # path: | # !~/.cargo/registry/src @@ -182,11 +194,18 @@ jobs: run: cargo deny check --hide-inclusion-graph build-neon: - needs: [ check-permissions, tag, build-buildtools-image ] + needs: [ check-permissions, tag, build-build-tools-image ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - options: --init + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # Raise locked memory limit for tokio-epoll-uring. + # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), + # io_uring will account the memory of the CQ and SQ as locked. 
+ # More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391 + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false matrix: @@ -212,32 +231,11 @@ jobs: done - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - - name: Check Postgres submodules revision - shell: bash -euo pipefail {0} - run: | - # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. the updated intentionally). - # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603 - - FAILED=false - for postgres in postgres-v14 postgres-v15 postgres-v16; do - expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"') - actual=$(git rev-parse "HEAD:vendor/${postgres}") - if [ "${expected}" != "${actual}" ]; then - echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'" - FAILED=true - fi - done - - if [ "${FAILED}" = "true" ]; then - echo >&2 "Please update vendors/revisions.json if these changes are intentional" - exit 1 - fi - - name: Set pg 14 revision for caching id: pg_v14_rev run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT @@ -284,7 +282,7 @@ jobs: # compressed crates. 
# - name: Cache cargo deps # id: cache_cargo -# uses: actions/cache@v3 +# uses: actions/cache@v4 # with: # path: | # ~/.cargo/registry/ @@ -298,24 +296,24 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' @@ -339,27 +337,8 @@ jobs: run: | ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - - name: Run rust tests - run: | - ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES - - # Run separate tests for real S3 - export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty - export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests - export REMOTE_STORAGE_S3_REGION=eu-central-1 - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 
'test(test_real_s3)' - - # Run separate tests for real Azure Blob Storage - # XXX: replace region with `eu-central-1`-like region - export ENABLE_REAL_AZURE_REMOTE_STORAGE=y - export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" - export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" - export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" - export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" - # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)' - + # Do install *before* running rust tests because they might recompile the + # binaries with different features/flags. - name: Install rust binaries run: | # Install target binaries @@ -400,6 +379,32 @@ jobs: done fi + - name: Run rust tests + env: + NEXTEST_RETRIES: 3 + run: | + #nextest does not yet support running doctests + cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + + for io_engine in std-fs tokio-epoll-uring ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + done + + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests + export REMOTE_STORAGE_S3_REGION=eu-central-1 + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)' + + # Run separate tests for real Azure Blob Storage + # XXX: replace region with `eu-central-1`-like region + export ENABLE_REAL_AZURE_REMOTE_STORAGE=y + export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}" + export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}" + export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" + export REMOTE_STORAGE_AZURE_REGION="${{ 
vars.REMOTE_STORAGE_AZURE_REGION }}" + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)' + - name: Install postgres binaries run: cp -a pg_install /tmp/neon/pg_install @@ -415,12 +420,15 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - needs: [ check-permissions, build-neon, build-buildtools-image, tag ] + needs: [ check-permissions, build-neon, build-build-tools-image, tag ] runs-on: [ self-hosted, gen3, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - # Default shared memory is 64mb - options: --init --shm-size=512mb + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: fail-fast: false matrix: @@ -428,13 +436,14 @@ jobs: pg_version: [ v14, v15, v16 ] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - name: Pytest regression tests uses: ./.github/actions/run-python-test-set + timeout-minutes: 60 with: build_type: ${{ matrix.build_type }} test_selection: regress @@ -448,27 +457,74 @@ jobs: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ needs.tag.outputs.build-tag }} + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: true + # Temporary disable this step until we figure out why it's so flaky + # Ref https://github.com/neondatabase/neon/issues/4540 - name: Merge and upload coverage data - if: matrix.build_type == 'debug' && matrix.pg_version == 'v14' + if: | + 
false && + matrix.build_type == 'debug' && matrix.pg_version == 'v14' uses: ./.github/actions/save-coverage-data - benchmarks: - needs: [ check-permissions, build-neon, build-buildtools-image ] + get-benchmarks-durations: + outputs: + json: ${{ steps.get-benchmark-durations.outputs.json }} + needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} - # Default shared memory is 64mb - options: --init --shm-size=512mb + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache poetry deps + uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + + - name: Install Python deps + run: ./scripts/pysync + + - name: get benchmark durations + id: get-benchmark-durations + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + run: | + poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \ + --days 10 \ + --output /tmp/benchmark_durations.json + echo "json=$(jq --compact-output '.' 
/tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT + + benchmarks: + needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ] + runs-on: [ self-hosted, gen3, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # for changed limits, see comments on `options:` earlier in this file + options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks') strategy: fail-fast: false matrix: - pytest_split_group: [ 1, 2, 3, 4 ] + # the amount of groups (N) should be reflected in `extra_params: --splits N ...` + pytest_split_group: [ 1, 2, 3, 4, 5 ] build_type: [ release ] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Pytest benchmarks uses: ./.github/actions/run-python-test-set @@ -477,25 +533,51 @@ jobs: test_selection: performance run_in_parallel: false save_perf_report: ${{ github.ref_name == 'main' }} - extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }} + extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} + benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" + PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + PAGESERVER_GET_VECTORED_IMPL: vectored + PAGESERVER_GET_IMPL: vectored + PAGESERVER_VALIDATE_VEC_GET: false # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones + report-benchmarks-failures: + needs: [ benchmarks, create-test-report ] + if: github.ref_name == 'main' && failure() && 
needs.benchmarks.result == 'failure' + runs-on: ubuntu-22.04 + + steps: + - uses: slackapi/slack-github-action@v1 + with: + channel-id: C060CNA47S9 # on-call-staging-storage-stream + slack-message: | + Benchmarks failed on main: ${{ github.event.head_commit.url }} + + Allure report: ${{ needs.create-test-report.outputs.report-url }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + create-test-report: - needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-buildtools-image ] + needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ] if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }} + outputs: + report-url: ${{ steps.create-allure-report.outputs.report-url }} runs-on: [ self-hosted, gen3, small ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Create Allure report if: ${{ !cancelled() }} @@ -504,10 +586,9 @@ jobs: with: store-test-results-into-db: true env: - REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries @@ -533,10 +614,13 @@ jobs: }) coverage-report: - needs: [ check-permissions, regress-tests, build-buildtools-image ] + needs: [ check-permissions, regress-tests, build-build-tools-image ] runs-on: [ self-hosted, gen3, small ] container: - image: 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }} + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init strategy: fail-fast: false @@ -547,7 +631,7 @@ jobs: coverage-json: ${{ steps.upload-coverage-report-new.outputs.summary-json }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 @@ -582,17 +666,6 @@ jobs: --input-objects=/tmp/coverage/binaries.list \ --format=lcov - - name: Upload coverage report - id: upload-coverage-report - env: - BUCKET: neon-github-public-dev - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - run: | - aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA} - - REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html - echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - - name: Build coverage report NEW id: upload-coverage-report-new env: @@ -627,23 +700,13 @@ jobs: REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/lcov/summary.json echo "summary-json=${REPORT_URL}" >> $GITHUB_OUTPUT - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 env: - REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }} REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: script: | - const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env - - await github.rest.repos.createCommitStatus({ - owner: context.repo.owner, - repo: context.repo.repo, - sha: `${COMMIT_SHA}`, - state: 'success', - target_url: `${REPORT_URL}`, - context: 'Code coverage report', - }) + const { REPORT_URL_NEW, COMMIT_SHA } = process.env await 
github.rest.repos.createCommitStatus({ owner: context.repo.owner, @@ -655,205 +718,240 @@ jobs: }) trigger-e2e-tests: + if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }} needs: [ check-permissions, promote-images, tag ] - runs-on: [ self-hosted, gen3, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init - steps: - - name: Set PR's status to pending and request a remote CI test - run: | - # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit - # but we need to use a real sha of a latest commit in the PR's branch for the e2e job, - # to place a job run status update later. - COMMIT_SHA=${{ github.event.pull_request.head.sha }} - # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those - COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}} + uses: ./.github/workflows/trigger-e2e-tests.yml + secrets: inherit - REMOTE_REPO="${{ github.repository_owner }}/cloud" + neon-image-arch: + needs: [ check-permissions, build-build-tools-image, tag ] + strategy: + matrix: + arch: [ x64, arm64 ] - curl -f -X POST \ - https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"state\": \"pending\", - \"context\": \"neon-cloud-e2e\", - \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\" - }" - - curl -f -X POST \ - https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \ - -H "Accept: application/vnd.github.v3+json" \ - --user "${{ secrets.CI_ACCESS_TOKEN }}" \ - --data \ - "{ - \"ref\": \"main\", - \"inputs\": { - \"ci_job_name\": \"neon-cloud-e2e\", - \"commit_hash\": \"$COMMIT_SHA\", - \"remote_repo\": \"${{ 
github.repository }}\", - \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\", - \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\" - } - }" - - neon-image: - needs: [ check-permissions, build-buildtools-image, tag ] - runs-on: [ self-hosted, gen3, large ] - container: gcr.io/kaniko-project/executor:v1.9.2-debug - defaults: - run: - shell: sh -eu {0} + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 - - name: Configure ECR and Docker Hub login + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: docker/setup-buildx-action@v2 - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Kaniko build neon - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . 
- --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }} - --build-arg TAG=${{ needs.build-buildtools-image.outputs.build-tools-tag }} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} - --destination neondatabase/neon:${{needs.tag.outputs.build-tag}} + - uses: docker/build-push-action@v5 + with: + context: . + build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile + cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }},mode=max + tags: | + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom - compute-tools-image: - runs-on: [ self-hosted, gen3, large ] - needs: [ check-permissions, build-buildtools-image, tag ] - container: gcr.io/kaniko-project/executor:v1.9.2-debug - defaults: - run: - shell: sh -eu {0} + neon-image: + needs: [ neon-image-arch, tag ] + runs-on: ubuntu-22.04 steps: - - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure ECR and Docker Hub login + - name: Create multi-arch image run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD 
}}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Kaniko build compute tools - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . - --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} - --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --dockerfile Dockerfile.compute-tools - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} - --destination neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} + - name: Push multi-arch image to ECR + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }} - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr - - compute-node-image: - needs: [ check-permissions, build-buildtools-image, tag ] - runs-on: [ self-hosted, gen3, large ] - container: - image: 
gcr.io/kaniko-project/executor:v1.9.2-debug - # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution."" - # Should be prevented by https://github.com/neondatabase/neon/issues/4281 - options: --add-host=download.osgeo.org:140.211.15.30 + compute-node-image-arch: + needs: [ check-permissions, build-build-tools-image, tag ] strategy: fail-fast: false matrix: version: [ v14, v15, v16 ] - defaults: - run: - shell: sh -eu {0} + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - name: Checkout - uses: actions/checkout@v1 # v3 won't work with kaniko + uses: actions/checkout@v4 with: submodules: true fetch-depth: 0 - - name: Configure ECR and Docker Hub login + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory run: | - DOCKERHUB_AUTH=$(echo -n "${{ secrets.NEON_DOCKERHUB_USERNAME }}:${{ secrets.NEON_DOCKERHUB_PASSWORD }}" | base64) - echo "::add-mask::${DOCKERHUB_AUTH}" + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + - uses: docker/setup-buildx-action@v2 + with: + # Disable parallelism for docker buildkit. + # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. 
+ config-inline: | + [worker.oci] + max-parallelism = 1 - cat <<-EOF > /kaniko/.docker/config.json - { - "auths": { - "https://index.docker.io/v1/": { - "auth": "${DOCKERHUB_AUTH}" - } - }, - "credHelpers": { - "369495373322.dkr.ecr.eu-central-1.amazonaws.com": "ecr-login" - } - } - EOF + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Kaniko build compute node with extensions - run: - /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true - --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache - --context . - --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} - --build-arg PG_VERSION=${{ matrix.version }} - --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} - --build-arg TAG=${{needs.build-buildtools-image.outputs.build-tools-tag}} - --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com - --dockerfile Dockerfile.compute-node - --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - --cleanup + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + - name: Build compute-node image + uses: docker/build-push-action@v5 + with: + context: . 
+ build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + PG_VERSION=${{ matrix.version }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max + tags: | + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + + - name: Build neon extensions test image + if: matrix.version == 'v16' + uses: docker/build-push-action@v5 + with: + context: . + build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + PG_VERSION=${{ matrix.version }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + target: neon-pg-ext-test + cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max + tags: | + neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} + + - name: Build compute-tools image + # compute-tools are Postgres independent, so build it only once + if: matrix.version == 'v16' + uses: docker/build-push-action@v5 + with: + target: compute-tools-image + context: . 
+ build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + tags: | + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom + + compute-node-image: + needs: [ compute-node-image-arch, tag ] + runs-on: ubuntu-22.04 + + strategy: + matrix: + version: [ v14, v15, v16 ] + + steps: + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Create multi-arch compute-node image + run: | + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + + - name: Create multi-arch neon-test-extensions image + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + + - name: Create multi-arch compute-tools image + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 + + - uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ 
secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + + - name: Push multi-arch compute-tools image to ECR + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} vm-compute-node-image: needs: [ check-permissions, tag, compute-node-image ] @@ -862,15 +960,12 @@ jobs: fail-fast: false matrix: version: [ v14, v15, v16 ] - defaults: - run: - shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.21.0 + VM_BUILDER_VERSION: v0.29.3 steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -879,30 +974,52 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder + # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings + # The default value is ~/.docker + - name: Set custom docker config directory + run: | + mkdir -p .docker-custom + echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV + + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker pull neondatabase/compute-node-${{ 
matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -spec=vm-image-spec.yaml \ - -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \ - -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + + - name: Remove custom docker config directory + if: always() + run: | + rm -rf .docker-custom test-images: - needs: [ check-permissions, tag, neon-image, compute-node-image, compute-tools-image ] - runs-on: [ self-hosted, gen3, small ] + needs: [ check-permissions, tag, neon-image, compute-node-image ] + strategy: + fail-fast: false + matrix: + arch: [ x64, arm64 ] + + runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -916,7 +1033,7 @@ jobs: - name: Verify image versions shell: bash # ensure no set -e for better error messages run: | - pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.tag.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" @@ -930,8 +1047,9 @@ jobs: exit 1 fi - - name: Verify 
docker-compose example - run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + - name: Verify docker-compose example and test extensions + timeout-minutes: 20 + run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh - name: Print logs and clean up if: always() @@ -941,88 +1059,71 @@ jobs: promote-images: needs: [ check-permissions, tag, test-images, vm-compute-node-image ] - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - # Don't add if-condition here. - # The job should always be run because we have dependant other jobs that shouldn't be skipped + runs-on: ubuntu-22.04 + + env: + VERSIONS: v14 v15 v16 steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + - uses: docker/login-action@v3 + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + - name: Login to dev ECR + uses: docker/login-action@v3 + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Copy vm-compute-node images to Docker Hub + - name: Copy vm-compute-node images to ECR run: | - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} vm-compute-node-v14 - crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} vm-compute-node-v15 - crane pull 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} vm-compute-node-v16 + for version in ${VERSIONS}; do + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done - name: Add latest tag to images - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' run: | - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest + for repo in neondatabase 369495373322.dkr.ecr.eu-central-1.amazonaws.com; do + docker buildx imagetools create -t $repo/neon:latest \ + $repo/neon:${{ needs.tag.outputs.build-tag }} - - name: Push images to production ECR - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' + docker buildx imagetools create -t $repo/compute-tools:latest \ + $repo/compute-tools:${{ needs.tag.outputs.build-tag }} + + for version in ${VERSIONS}; 
do + docker buildx imagetools create -t $repo/compute-node-${version}:latest \ + $repo/compute-node-${version}:${{ needs.tag.outputs.build-tag }} + + docker buildx imagetools create -t $repo/vm-compute-node-${version}:latest \ + $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }} + done + done + docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \ + neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} + + - name: Login to prod ECR + uses: docker/login-action@v3 + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + with: + registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }} + password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }} + + - name: Copy all images to prod ECR + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' run: | - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 
093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v16:latest - crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v16:latest - - - name: Configure Docker Hub login - run: | - # ECR Credential Helper & Docker Hub don't work together in config, hence reset - echo "" > /github/home/.docker/config.json - crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io - - - name: Push vm-compute-node to Docker Hub - run: | - crane push vm-compute-node-v14 neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} - crane push vm-compute-node-v15 neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} - crane push vm-compute-node-v16 neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} - - - name: Push latest tags to Docker Hub - if: | - (github.ref_name == 'main' || github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - run: | - crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/compute-node-v16:${{needs.tag.outputs.build-tag}} latest - crane tag neondatabase/vm-compute-node-v16:${{needs.tag.outputs.build-tag}} latest - - - name: Cleanup ECR folder - run: rm -rf ~/.ecr + for image in neon compute-tools 
{vm-,}compute-node-{v14,v15,v16}; do + docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ + 369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} + done trigger-custom-extensions-build-and-wait: needs: [ check-permissions, tag ] - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Set PR's status to pending and request a remote CI test run: | @@ -1096,7 +1197,7 @@ jobs: deploy: needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ] - if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' + if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy' runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest @@ -1116,7 +1217,7 @@ jobs: done - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: false fetch-depth: 0 @@ -1127,19 +1228,47 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - - # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions - gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + -f deployPgSniRouter=false \ + -f deployProxy=false \ + -f 
deployStorage=true \ + -f deployStorageBroker=true \ + -f deployStorageController=true \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f deployPreprodRegion=true + + gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \ + -f deployStorage=true \ + -f deployStorageBroker=true \ + -f deployStorageController=true \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \ + -f deployPgSniRouter=true \ + -f deployProxy=true \ + -f deployStorage=false \ + -f deployStorageBroker=false \ + -f deployStorageController=false \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} \ + -f deployPreprodRegion=true + + gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \ + -f deployPgSniRouter=true \ + -f deployProxy=true \ + -f branch=main \ + -f dockerTag=${{needs.tag.outputs.build-tag}} else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 fi - name: Create git tag - if: github.ref_name == 'release' - uses: actions/github-script@v6 + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' + uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 @@ -1151,9 +1280,10 @@ jobs: sha: context.sha, }) + # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok - name: Create GitHub release if: github.ref_name == 'release' - uses: actions/github-script@v6 + uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 @@ -1202,3 +1332,11 @@ jobs: time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME} done + + pin-build-tools-image: + needs: [ build-build-tools-image, promote-images, regress-tests ] 
+ if: github.ref_name == 'main' + uses: ./.github/workflows/pin-build-tools-image.yml + with: + from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} + secrets: inherit diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml new file mode 100644 index 0000000000..807a9ef3bd --- /dev/null +++ b/.github/workflows/check-build-tools-image.yml @@ -0,0 +1,51 @@ +name: Check build-tools image + +on: + workflow_call: + outputs: + image-tag: + description: "build-tools image tag" + value: ${{ jobs.check-image.outputs.tag }} + found: + description: "Whether the image is found in the registry" + value: ${{ jobs.check-image.outputs.found }} + +defaults: + run: + shell: bash -euo pipefail {0} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + +jobs: + check-image: + runs-on: ubuntu-22.04 + outputs: + tag: ${{ steps.get-build-tools-tag.outputs.image-tag }} + found: ${{ steps.check-image.outputs.found }} + + steps: + - uses: actions/checkout@v4 + + - name: Get build-tools image tag for the current commit + id: get-build-tools-tag + env: + IMAGE_TAG: | + ${{ hashFiles('Dockerfile.build-tools', + '.github/workflows/check-build-tools-image.yml', + '.github/workflows/build-build-tools-image.yml') }} + run: | + echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT + + - name: Check if such tag found in the registry + id: check-image + env: + IMAGE_TAG: ${{ steps.get-build-tools-tag.outputs.image-tag }} + run: | + if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then + found=true + else + found=false + fi + + echo "found=${found}" | tee -a $GITHUB_OUTPUT diff --git a/.github/workflows/check-permissions.yml b/.github/workflows/check-permissions.yml new file mode 100644 index 0000000000..9c42794797 --- /dev/null +++ b/.github/workflows/check-permissions.yml @@ -0,0 +1,36 @@ +name: Check Permissions + +on: + 
workflow_call: + inputs: + github-event-name: + required: true + type: string + +defaults: + run: + shell: bash -euo pipefail {0} + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + +jobs: + check-permissions: + runs-on: ubuntu-22.04 + steps: + - name: Disallow CI runs on PRs from forks + if: | + inputs.github-event-name == 'pull_request' && + github.event.pull_request.head.repo.full_name != github.repository + run: | + if [ "${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association) }}" = "true" ]; then + MESSAGE="Please create a PR from a branch of ${GITHUB_REPOSITORY} instead of a fork" + else + MESSAGE="The PR should be reviewed and labelled with 'approved-for-ci-run' to trigger a CI run" + fi + + # TODO: use actions/github-script to post this message as a PR comment + echo >&2 "We don't run CI for PRs from forks" + echo >&2 "${MESSAGE}" + + exit 1 diff --git a/.github/workflows/cleanup-caches-by-a-branch.yml b/.github/workflows/cleanup-caches-by-a-branch.yml new file mode 100644 index 0000000000..0c074e36dc --- /dev/null +++ b/.github/workflows/cleanup-caches-by-a-branch.yml @@ -0,0 +1,32 @@ +# A workflow from +# https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#force-deleting-cache-entries + +name: cleanup caches by a branch +on: + pull_request: + types: + - closed + +jobs: + cleanup: + runs-on: ubuntu-22.04 + steps: + - name: Cleanup + run: | + gh extension install actions/gh-actions-cache + + echo "Fetching list of cache key" + cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH -L 100 | cut -f 1 ) + + ## Setting this to not fail the workflow while deleting cache keys. + set +e + echo "Deleting caches..." 
+ for cacheKey in $cacheKeysForPR + do + gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm + done + echo "Done" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + BRANCH: refs/pull/${{ github.event.pull_request.number }}/merge diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index c6c2b7386a..7d2187e59c 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -20,13 +20,31 @@ env: COPT: '-Werror' jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name}} + + check-build-tools-image: + needs: [ check-permissions ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + check-macos-build: + needs: [ check-permissions ] if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' timeout-minutes: 90 - runs-on: macos-latest + runs-on: macos-14 env: # Use release build only, to have less debug info around @@ -57,24 +75,24 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os 
}}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS run: | @@ -82,14 +100,14 @@ jobs: echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV - name: Cache cargo deps - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ~/.cargo/registry !~/.cargo/registry/src ~/.cargo/git target - key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' @@ -110,26 +128,30 @@ jobs: run: make walproposer-lib -j$(sysctl -n hw.ncpu) - name: Run cargo build - run: cargo build --all --release + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release - name: Check that no warnings are produced run: ./run_clippy.sh check-linux-arm-build: + needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, small-arm64 ] env: # Use release build only, to have less debug info around # Hence keeping target/ (and general cache size) smaller BUILD_TYPE: release CARGO_FEATURES: --features testing - CARGO_FLAGS: --locked --release + CARGO_FLAGS: --release AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ 
secrets.AWS_SECRET_KEY_DEV }} container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init steps: @@ -171,21 +193,21 @@ jobs: - name: Cache postgres v14 build id: cache_pg_14 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v14 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v15 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: pg_install/v16 key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} @@ -210,18 +232,20 @@ jobs: - name: Run cargo build run: | - mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests + mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test + env: + NEXTEST_RETRIES: 3 run: | - cargo test $CARGO_FLAGS $CARGO_FEATURES + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests export REMOTE_STORAGE_S3_REGION=eu-central-1 # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3 + cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc) # Run separate tests for real Azure Blob Storage # XXX: replace region with `eu-central-1`-like region 
@@ -231,16 +255,25 @@ jobs: export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}" export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}" # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now - cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure + cargo nextest run --package remote_storage --test test_real_azure -j$(nproc) check-codestyle-rust-arm: + needs: [ check-permissions, build-build-tools-image ] timeout-minutes: 90 - runs-on: [ self-hosted, dev, arm64 ] + runs-on: [ self-hosted, small-arm64 ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init + strategy: + fail-fast: false + matrix: + build_type: [ debug, release ] + steps: - name: Fix git ownership run: | @@ -277,41 +310,49 @@ jobs: exit 1 fi echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + if: matrix.build_type == 'debug' run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS - name: Run cargo clippy (release) + if: matrix.build_type == 'release' run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items + if: matrix.build_type == 'release' + run: cargo doc --workspace --no-deps --document-private-items -j$(nproc) env: RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - name: Check formatting - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo fmt --all -- --check # 
https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - name: Check rust dependencies - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: | cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack # https://github.com/EmbarkStudios/cargo-deny - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} + if: ${{ !cancelled() && matrix.build_type == 'release' }} run: cargo deny check gather-rust-build-stats: + needs: [ check-permissions, build-build-tools-image ] if: | contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') || contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') || github.ref_name == 'main' - runs-on: [ self-hosted, gen3, large ] + runs-on: [ self-hosted, large ] container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init env: @@ -337,7 +378,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings + run: cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats @@ -352,7 +393,7 @@ jobs: echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT - name: Publish build stats report - uses: actions/github-script@v6 + uses: actions/github-script@v7 env: REPORT_URL: ${{ steps.upload-stats.outputs.report-url }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml index 224b7b4a6d..fef3aec754 100644 --- a/.github/workflows/pg_clients.yml +++ b/.github/workflows/pg_clients.yml 
@@ -20,7 +20,7 @@ concurrency: jobs: test-postgres-client-libs: # TODO: switch to gen2 runner, requires docker - runs-on: [ ubuntu-latest ] + runs-on: ubuntu-22.04 env: DEFAULT_PG_VERSION: 14 @@ -28,7 +28,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: @@ -38,11 +38,10 @@ jobs: uses: snok/install-poetry@v1 - name: Cache poetry deps - id: cache_poetry - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} @@ -83,7 +82,7 @@ jobs: # It will be fixed after switching to gen2 runner - name: Upload python test logs if: always() - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: retention-days: 7 name: python-test-pg_clients-${{ runner.os }}-stage-logs diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml new file mode 100644 index 0000000000..024594532f --- /dev/null +++ b/.github/workflows/pin-build-tools-image.yml @@ -0,0 +1,73 @@ +name: 'Pin build-tools image' + +on: + workflow_dispatch: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + workflow_call: + inputs: + from-tag: + description: 'Source tag' + required: true + type: string + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: pin-build-tools-image-${{ inputs.from-tag }} + cancel-in-progress: false + +permissions: {} + +jobs: + tag-image: + runs-on: ubuntu-22.04 + + env: + FROM_TAG: ${{ inputs.from-tag }} + TO_TAG: pinned + + steps: + - name: Check if we really need to pin the image + id: check-manifests + run: | + docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json + docker manifest inspect neondatabase/build-tools:${TO_TAG} > 
${TO_TAG}.json + + if diff ${FROM_TAG}.json ${TO_TAG}.json; then + skip=true + else + skip=false + fi + + echo "skip=${skip}" | tee -a $GITHUB_OUTPUT + + - uses: docker/login-action@v3 + if: steps.check-manifests.outputs.skip == 'false' + with: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + + - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub + if: steps.check-manifests.outputs.skip == 'false' + run: | + docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \ + neondatabase/build-tools:${FROM_TAG} + + - uses: docker/login-action@v3 + if: steps.check-manifests.outputs.skip == 'false' + with: + registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com + username: ${{ secrets.AWS_ACCESS_KEY_DEV }} + password: ${{ secrets.AWS_SECRET_KEY_DEV }} + + - name: Tag build-tools with `${{ env.TO_TAG }}` in ECR + if: steps.check-manifests.outputs.skip == 'false' + run: | + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ + neondatabase/build-tools:${FROM_TAG} diff --git a/.github/workflows/release-notify.yml b/.github/workflows/release-notify.yml index ba396dba74..8bd10e993c 100644 --- a/.github/workflows/release-notify.yml +++ b/.github/workflows/release-notify.yml @@ -19,7 +19,7 @@ on: jobs: notify: - runs-on: [ ubuntu-latest ] + runs-on: ubuntu-22.04 steps: - uses: neondatabase/dev-actions/release-pr-notify@main diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ba37c5827a..90a3aaaf2d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -2,12 +2,31 @@ name: Create Release Branch on: schedule: - - cron: '0 6 * * 1' + # It should be kept in sync with if-condition in jobs + - cron: '0 6 * * MON' # Storage release + - cron: '0 6 * * THU' # Proxy release workflow_dispatch: + inputs: + create-storage-release-branch: + type: boolean + description: 'Create Storage release PR' + 
required: false + create-proxy-release-branch: + type: boolean + description: 'Create Proxy release PR' + required: false + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. +permissions: {} + +defaults: + run: + shell: bash -euo pipefail {0} jobs: - create_release_branch: - runs-on: [ ubuntu-latest ] + create-storage-release-branch: + if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} + runs-on: ubuntu-22.04 permissions: contents: write # for `git push` @@ -18,27 +37,67 @@ jobs: with: ref: main - - name: Get current date - id: date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + - name: Set environment variables + run: | + echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV + echo "RELEASE_BRANCH=rc/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV - name: Create release branch - run: git checkout -b releases/${{ steps.date.outputs.date }} + run: git checkout -b $RELEASE_BRANCH - name: Push new branch - run: git push origin releases/${{ steps.date.outputs.date }} + run: git push origin $RELEASE_BRANCH - name: Create pull request into release env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} run: | cat << EOF > body.md - ## Release ${{ steps.date.outputs.date }} + ## Storage & Compute release ${RELEASE_DATE} - **Please merge this PR using 'Create a merge commit'!** + **Please merge this Pull Request using 'Create a merge commit' button** EOF - gh pr create --title "Release ${{ steps.date.outputs.date }}" \ + gh pr create --title "Release ${RELEASE_DATE}" \ --body-file "body.md" \ - --head "releases/${{ steps.date.outputs.date }}" \ + --head "${RELEASE_BRANCH}" \ --base "release" + + create-proxy-release-branch: + if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} + runs-on: ubuntu-22.04 + + permissions: + contents: write # for `git push` + + steps: + - name: Check 
out code + uses: actions/checkout@v4 + with: + ref: main + + - name: Set environment variables + run: | + echo "RELEASE_DATE=$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV + echo "RELEASE_BRANCH=rc/proxy/$(date +'%Y-%m-%d')" | tee -a $GITHUB_ENV + + - name: Create release branch + run: git checkout -b $RELEASE_BRANCH + + - name: Push new branch + run: git push origin $RELEASE_BRANCH + + - name: Create pull request into release + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + cat << EOF > body.md + ## Proxy release ${RELEASE_DATE} + + **Please merge this Pull Request using 'Create a merge commit' button** + EOF + + gh pr create --title "Proxy release ${RELEASE_DATE}" \ + --body-file "body.md" \ + --head "${RELEASE_BRANCH}" \ + --base "release-proxy" diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml new file mode 100644 index 0000000000..77928a343e --- /dev/null +++ b/.github/workflows/trigger-e2e-tests.yml @@ -0,0 +1,133 @@ +name: Trigger E2E Tests + +on: + pull_request: + types: + - ready_for_review + workflow_call: + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix + E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +jobs: + cancel-previous-e2e-tests: + if: github.event_name == 'pull_request' + runs-on: ubuntu-22.04 + + steps: + - name: Cancel previous e2e-tests runs for this PR + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + gh workflow --repo neondatabase/cloud \ + run cancel-previous-in-concurrency-group.yml \ + --field concurrency_group="${{ env.E2E_CONCURRENCY_GROUP }}" + + tag: + runs-on: ubuntu-22.04 + outputs: + build-tag: ${{ steps.build-tag.outputs.tag 
}} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get build tag + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + CURRENT_BRANCH: ${{ github.head_ref || github.ref_name }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + echo "tag=$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then + echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') + echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT + fi + id: build-tag + + trigger-e2e-tests: + needs: [ tag ] + runs-on: ubuntu-22.04 + env: + TAG: ${{ needs.tag.outputs.build-tag }} + steps: + - name: check if ecr image are present + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + run: | + for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do + OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text) + if [ "$OUTPUT" == "" ]; then + echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT + exit 1 + fi + done + + - name: Set e2e-platforms + id: e2e-platforms + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Default set of platforms to run e2e tests on + platforms='["docker", "k8s"]' + + # If the PR changes vendor/, pgxn/ or 
libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms. + # If the workflow run is not a pull request, add k8s-neonvm to the list. + if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then + for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do + case "$f" in + vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node) + platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique') + ;; + *) + # no-op + ;; + esac + done + else + platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique') + fi + + echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT + + - name: Set PR's status to pending and request a remote CI test + env: + E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }} + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud" + + gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \ + --method POST \ + --raw-field "state=pending" \ + --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \ + --raw-field "context=neon-cloud-e2e" + + gh workflow --repo ${REMOTE_REPO} \ + run testing.yml \ + --ref "main" \ + --raw-field "ci_job_name=neon-cloud-e2e" \ + --raw-field "commit_hash=$COMMIT_SHA" \ + --raw-field "remote_repo=${GITHUB_REPOSITORY}" \ + --raw-field "storage_image_tag=${TAG}" \ + --raw-field "compute_image_tag=${TAG}" \ + --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \ + --raw-field "e2e-platforms=${E2E_PLATFORMS}" diff --git a/.github/workflows/update_build_tools_image.yml b/.github/workflows/update_build_tools_image.yml deleted file mode 100644 index 88bab797b7..0000000000 --- a/.github/workflows/update_build_tools_image.yml +++ /dev/null @@ -1,130 +0,0 @@ -name: 'Update build tools image tag' - -# This workflow it used to update tag of build tools 
in ECR. -# The most common use case is adding/moving `pinned` tag to `${GITHUB_RUN_IT}` image. - -on: - workflow_dispatch: - inputs: - from-tag: - description: 'Source tag' - required: true - type: string - to-tag: - description: 'Destination tag' - required: true - type: string - default: 'pinned' - -defaults: - run: - shell: bash -euo pipefail {0} - -env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} - -permissions: {} - -jobs: - tag-image: - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - outputs: - next-digest-buildtools: ${{ steps.next-digest.outputs.next-digest-buildtools }} - prev-digest-buildtools: ${{ steps.prev-digest.outputs.prev-digest-buildtools }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Get source image digest - id: next-digest - run: | - NEXT_DIGEST=$(crane digest ${IMAGE}:${FROM_TAG} || true) - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist" - exit 1 - fi - - echo "Current ${IMAGE}@${FROM_TAG} image is ${IMAGE}@${NEXT_DIGEST}" - echo "next-digest-buildtools=$NEXT_DIGEST" >> $GITHUB_OUTPUT - - - name: Get destination image digest (if already exists) - id: prev-digest - run: | - PREV_DIGEST=$(crane digest ${IMAGE}:${TO_TAG} || true) - if [ -z "${PREV_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${TO_TAG} does not exist (it's ok)" - else - echo >&2 
"Current ${IMAGE}@${TO_TAG} image is ${IMAGE}@${PREV_DIGEST}" - - echo "prev-digest-buildtools=$PREV_DIGEST" >> $GITHUB_OUTPUT - fi - - - name: Tag image - run: | - crane tag "${IMAGE}:${FROM_TAG}" "${TO_TAG}" - - rollback-tag-image: - needs: tag-image - if: ${{ !success() }} - - runs-on: [ self-hosted, gen3, small ] - container: golang:1.19-bullseye - - env: - IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools - FROM_TAG: ${{ inputs.from-tag }} - TO_TAG: ${{ inputs.to-tag }} - - steps: - - name: Install Crane & ECR helper - run: | - go install github.com/google/go-containerregistry/cmd/crane@a54d64203cffcbf94146e04069aae4a97f228ee2 # v0.16.1 - go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@adf1bafd791ae7d4ff098108b1e91f36a4da5404 # v0.7.1 - - - name: Configure ECR login - run: | - mkdir /github/home/.docker/ - echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json - - - name: Restore previous tag if needed - run: | - NEXT_DIGEST="${{ needs.tag-image.outputs.next-digest-buildtools }}" - PREV_DIGEST="${{ needs.tag-image.outputs.prev-digest-buildtools }}" - - if [ -z "${NEXT_DIGEST}" ]; then - echo >&2 "Image ${IMAGE}:${FROM_TAG} does not exist, nothing to rollback" - exit 0 - fi - - if [ -z "${PREV_DIGEST}" ]; then - # I guess we should delete the tag here/untag the image, but crane does not support it - # - https://github.com/google/go-containerregistry/issues/999 - - echo >&2 "Image ${IMAGE}:${TO_TAG} did not exist, but it was created by the job, no need to rollback" - - exit 0 - fi - - CURRENT_DIGEST=$(crane digest "${IMAGE}:${TO_TAG}") - if [ "${CURRENT_DIGEST}" == "${NEXT_DIGEST}" ]; then - crane tag "${IMAGE}@${PREV_DIGEST}" "${TO_TAG}" - - echo >&2 "Successfully restored ${TO_TAG} tag from ${IMAGE}@${CURRENT_DIGEST} to ${IMAGE}@${PREV_DIGEST}" - else - echo >&2 "Image ${IMAGE}:${TO_TAG}@${CURRENT_DIGEST} is not required to be restored" - fi diff --git a/.gitignore 
b/.gitignore index 3f4495c9e7..2c38cdcc59 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ test_output/ neon.iml /.neon /integration_tests/.neon +compaction-suite-results.* # Coverage *.profraw diff --git a/CODEOWNERS b/CODEOWNERS index e384dc39f1..af2fa6088e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,12 +1,13 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/control_plane/ @neondatabase/compute @neondatabase/storage -/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute +/storage_controller @neondatabase/storage +/libs/pageserver_api/ @neondatabase/storage +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/safekeepers -/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute +/libs/vm_monitor/ @neondatabase/autoscaling /pageserver/ @neondatabase/storage /pgxn/ @neondatabase/compute +/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers /proxy/ @neondatabase/proxy /safekeeper/ @neondatabase/safekeepers /vendor/ @neondatabase/compute diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b318c295a3..164eb77f58 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,7 @@ ln -s ../../pre-commit.py .git/hooks/pre-commit This will run following checks on staged files before each commit: - `rustfmt` -- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks). +- checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks). There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date. @@ -54,6 +54,9 @@ _An instruction for maintainers_ - If and only if it looks **safe** (i.e. 
it doesn't contain any malicious code which could expose secrets or harm the CI), then: - Press the "Approve and run" button in GitHub UI - Add the `approved-for-ci-run` label to the PR + - Currently draft PR will skip e2e test (only for internal contributors). After turning the PR 'Ready to Review' CI will trigger e2e test + - Add `run-e2e-tests-in-draft` label to run e2e test in draft PR (override above behaviour) + - The `approved-for-ci-run` workflow will add `run-e2e-tests-in-draft` automatically to run e2e test for external contributors Repeat all steps after any change to the PR. - When the changes are ready to get merged — merge the original PR (not the internal one) @@ -71,16 +74,11 @@ We're using the following approach to make it work: For details see [`approved-for-ci-run.yml`](.github/workflows/approved-for-ci-run.yml) -## How do I add the "pinned" tag to an buildtools image? -We use the `pinned` tag for `Dockerfile.buildtools` build images in our CI/CD setup, currently adding the `pinned` tag is a manual operation. +## How do I make build-tools image "pinned" -You can call it from GitHub UI: https://github.com/neondatabase/neon/actions/workflows/update_build_tools_image.yml, -or using GitHub CLI: +It's possible to update the `pinned` tag of the `build-tools` image using the `pin-build-tools-image.yml` workflow. ```bash -gh workflow -R neondatabase/neon run update_build_tools_image.yml \ - -f from-tag=6254913013 \ - -f to-tag=pinned \ - -# Default `-f to-tag` is `pinned`, so the parameter can be omitted. 
-``` \ No newline at end of file +gh workflow -R neondatabase/neon run pin-build-tools-image.yml \ + -f from-tag=cc98d9b00d670f182c507ae3783342bd7e64c31e +``` diff --git a/Cargo.lock b/Cargo.lock index 5f544a05c6..cf8a0b3286 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" [[package]] name = "addr2line" -version = "0.19.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" dependencies = [ "gimli", ] @@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.5" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", @@ -241,7 +241,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -252,7 +252,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -270,6 +270,12 @@ dependencies = [ "critical-section", ] +[[package]] +name = "atomic-take" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3" + [[package]] name = "autocfg" version = "1.1.0" @@ -278,12 +284,11 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "1.0.1" +version = "1.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "80c950a809d39bc9480207cb1cfc879ace88ea7e3a4392a8e9999e45d6e5692e" +checksum = "baaa0be6ee7d90b775ae6ccb6d2ba182b91219ec2001f92338773a094246af1d" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-sdk-sso", "aws-sdk-ssooidc", @@ -298,20 +303,21 @@ dependencies = [ "bytes", "fastrand 2.0.0", "hex", - "http", - "hyper", + "http 0.2.9", + "hyper 0.14.26", "ring 0.17.6", "time", "tokio", "tracing", + "url", "zeroize", ] [[package]] name = "aws-credential-types" -version = "1.0.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1317e1a3514b103cf7d5828bbab3b4d30f56bd22d684f8568bc51b6cfbbb1c" +checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -319,30 +325,13 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-http" -version = "0.60.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "361c4310fdce94328cc2d1ca0c8a48c13f43009c61d3367585685a50ca8c66b6" -dependencies = [ - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "http", - "http-body", - "pin-project-lite", - "tracing", -] - [[package]] name = "aws-runtime" -version = "1.0.1" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ed7ef604a15fd0d4d9e43701295161ea6b504b63c44990ead352afea2bc15e9" +checksum = "785da4a15e7b166b505fd577e4560c7a7cd8fbdf842eb1336cbcbf8944ce56f1" dependencies = [ "aws-credential-types", - "aws-http", "aws-sigv4", "aws-smithy-async", "aws-smithy-eventstream", @@ -350,21 +339,47 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", + "bytes", "fastrand 2.0.0", - "http", + "http 0.2.9", + "http-body 0.4.5", "percent-encoding", + "pin-project-lite", "tracing", "uuid", ] [[package]] -name = "aws-sdk-s3" -version = "1.4.0" +name = 
"aws-sdk-iam" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dcafc2fe52cc30b2d56685e2fa6a879ba50d79704594852112337a472ddbd24" +checksum = "b8ae76026bfb1b80a6aed0bb400c1139cd9c0563e26bce1986cd021c6a968c7b" dependencies = [ "aws-credential-types", - "aws-http", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "http 0.2.9", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-s3" +version = "1.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bc5ce518d4b8d16e0408de7bdf1b3097cec61a7daa979750a208f8d9934386d" +dependencies = [ + "ahash", + "aws-credential-types", "aws-runtime", "aws-sigv4", "aws-smithy-async", @@ -378,23 +393,27 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "http", - "http-body", + "fastrand 2.0.0", + "hex", + "hmac", + "http 0.2.9", + "http-body 0.4.5", + "lru", "once_cell", "percent-encoding", - "regex", + "regex-lite", + "sha2", "tracing", "url", ] [[package]] name = "aws-sdk-sso" -version = "1.3.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0619ab97a5ca8982e7de073cdc66f93e5f6a1b05afc09e696bec1cb3607cd4df" +checksum = "ca3d6c4cba4e009391b72b0fcf12aff04ea3c9c3aa2ecaafa330326a8bd7e601" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -404,19 +423,19 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sdk-ssooidc" -version = "1.3.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04b9f5474cc0f35d829510b2ec8c21e352309b46bf9633c5a81fb9321e9b1c7" +checksum = 
"73400dc239d14f63d932f4ca7b55af5e9ef1f857f7d70655249ccc287adb2570" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -426,19 +445,19 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sdk-sts" -version = "1.3.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5700da387716ccfc30b27f44b008f457e1baca5b0f05b6b95455778005e3432a" +checksum = "10f8858308af76fba3e5ffcf1bb56af5471574d2bdfaf0159470c25bc2f760e5" dependencies = [ "aws-credential-types", - "aws-http", "aws-runtime", "aws-smithy-async", "aws-smithy-http", @@ -449,16 +468,17 @@ dependencies = [ "aws-smithy-types", "aws-smithy-xml", "aws-types", - "http", - "regex", + "http 0.2.9", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sigv4" -version = "1.0.1" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380adcc8134ad8bbdfeb2ace7626a869914ee266322965276cbc54066186d236" +checksum = "58b56f1cbe6fd4d0c2573df72868f20ab1c125ca9c9dbce17927a463433a2e57" dependencies = [ "aws-credential-types", "aws-smithy-eventstream", @@ -470,11 +490,11 @@ dependencies = [ "form_urlencoded", "hex", "hmac", - "http", + "http 0.2.9", + "http 1.1.0", "once_cell", "p256", "percent-encoding", - "regex", "ring 0.17.6", "sha2", "subtle", @@ -485,9 +505,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.0.2" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e37ca17d25fe1e210b6d4bdf59b81caebfe99f986201a1228cb5061233b4b13" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", @@ -496,9 +516,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.60.0" +version = "0.60.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5a373ec01aede3dd066ec018c1bc4e8f5dd11b2c11c59c8eef1a5c68101f397" +checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -506,8 +526,8 @@ dependencies = [ "crc32c", "crc32fast", "hex", - "http", - "http-body", + "http 0.2.9", + "http-body 0.4.5", "md-5", "pin-project-lite", "sha1", @@ -517,9 +537,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.60.0" +version = "0.60.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c669e1e5fc0d79561bf7a122b118bd50c898758354fe2c53eb8f2d31507cbc3" +checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858" dependencies = [ "aws-smithy-types", "bytes", @@ -528,9 +548,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.60.0" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b1de8aee22f67de467b2e3d0dd0fb30859dc53f579a63bd5381766b987db644" +checksum = "4a7de001a1b9a25601016d8057ea16e31a45fdca3751304c8edf4ad72e706c08" dependencies = [ "aws-smithy-eventstream", "aws-smithy-runtime-api", @@ -538,8 +558,8 @@ dependencies = [ "bytes", "bytes-utils", "futures-core", - "http", - "http-body", + "http 0.2.9", + "http-body 0.4.5", "once_cell", "percent-encoding", "pin-project-lite", @@ -549,18 +569,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.60.0" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a46dd338dc9576d6a6a5b5a19bd678dcad018ececee11cf28ecd7588bd1a55c" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.60.0" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"feb5b8c7a86d4b6399169670723b7e6f21a39fc833a30f5c5a2f997608178129" +checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" dependencies = [ "aws-smithy-types", "urlencoding", @@ -568,9 +588,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.0.2" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "273479291efc55e7b0bce985b139d86b6031adb8e50f65c1f712f20ba38f6388" +checksum = "c9ac79e9f3a4d576f3cd4a470a0275b138d9e7b11b1cd514a6858ae0a79dd5bb" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -578,29 +598,31 @@ dependencies = [ "aws-smithy-types", "bytes", "fastrand 2.0.0", - "h2", - "http", - "http-body", - "hyper", - "hyper-rustls", + "h2 0.3.26", + "http 0.2.9", + "http-body 0.4.5", + "http-body 1.0.0", + "hyper 0.14.26", + "hyper-rustls 0.24.0", "once_cell", "pin-project-lite", "pin-utils", - "rustls", + "rustls 0.21.11", "tokio", "tracing", ] [[package]] name = "aws-smithy-runtime-api" -version = "1.0.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cebff0d977b6b6feed2fd07db52aac58ba3ccaf26cdd49f1af4add5061bef9" +checksum = "04ec42c2f5c0e7796a2848dde4d9f3bf8ce12ccbb3d5aa40c52fa0cdd61a1c47" dependencies = [ "aws-smithy-async", "aws-smithy-types", "bytes", - "http", + "http 0.2.9", + "http 1.1.0", "pin-project-lite", "tokio", "tracing", @@ -609,16 +631,19 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.0.2" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f48b3f27ddb40ab19892a5abda331f403e3cb877965e4e51171447807104af" +checksum = "baf98d97bba6ddaba180f1b1147e202d8fe04940403a95a3f826c790f931bbd1" dependencies = [ "base64-simd", "bytes", "bytes-utils", "futures-core", - "http", - "http-body", + "http 0.2.9", + "http 1.1.0", + "http-body 0.4.5", + "http-body 1.0.0", + "http-body-util", "itoa", "num-integer", "pin-project-lite", @@ -632,24 
+657,24 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.0" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ec40d74a67fd395bc3f6b4ccbdf1543672622d905ef3f979689aea5b730cb95" +checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.0.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8403fc56b1f3761e8efe45771ddc1165e47ec3417c68e68a4519b5cb030159ca" +checksum = "5a43b56df2c529fe44cb4d92bd64d0479883fb9608ff62daede4df5405381814" dependencies = [ "aws-credential-types", "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "http", + "http 0.2.9", "rustc_version", "tracing", ] @@ -666,11 +691,11 @@ dependencies = [ "bitflags 1.3.2", "bytes", "futures-util", - "http", - "http-body", - "hyper", + "http 0.2.9", + "http-body 0.4.5", + "hyper 0.14.26", "itoa", - "matchit", + "matchit 0.7.0", "memchr", "mime", "percent-encoding", @@ -698,8 +723,8 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http", - "http-body", + "http 0.2.9", + "http-body 0.4.5", "mime", "rustversion", "tower-layer", @@ -708,9 +733,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" +checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7" dependencies = [ "async-trait", "base64 0.21.1", @@ -726,7 +751,7 @@ dependencies = [ "pin-project", "quick-xml", "rand 0.8.5", - "reqwest", + "reqwest 0.11.19", "rustc_version", "serde", "serde_json", @@ -738,9 +763,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.18.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" +checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f" dependencies = [ "async-lock", "async-trait", @@ -751,16 +776,15 @@ dependencies = [ "pin-project", "serde", "time", - "tz-rs", "url", "uuid", ] [[package]] name = "azure_storage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" +checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266" dependencies = [ "RustyXML", "async-lock", @@ -777,9 +801,9 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" +checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94" dependencies = [ "RustyXML", "azure_core", @@ -798,9 +822,9 @@ dependencies = [ [[package]] name = "azure_svc_blobstorage" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b" dependencies = [ "azure_core", "bytes", @@ -814,15 +838,15 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.67" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" dependencies = [ "addr2line", "cc", "cfg-if", "libc", - "miniz_oxide 0.6.2", + "miniz_oxide", "object", "rustc-demangle", ] @@ -851,6 +875,12 @@ version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "base64-simd" version = "0.8.0" @@ -867,6 +897,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +[[package]] +name = "bcder" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c627747a6774aab38beb35990d88309481378558875a41da1a4b2e373c906ef0" +dependencies = [ + "bytes", + "smallvec", +] + [[package]] name = "bincode" version = "1.3.3" @@ -895,7 +935,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.32", + "syn 2.0.52", "which", ] @@ -938,6 +978,12 @@ version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "bytemuck" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5" + [[package]] name = "byteorder" version = "1.4.3" @@ -946,9 +992,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" dependencies = [ "serde", ] @@ -1025,9 +1071,9 @@ dependencies = [ [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" 
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" dependencies = [ "android-tzdata", "iana-time-zone", @@ -1035,7 +1081,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.48.0", + "windows-targets 0.52.4", ] [[package]] @@ -1062,7 +1108,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", - "half", + "half 1.8.2", ] [[package]] @@ -1106,10 +1152,10 @@ version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1118,22 +1164,26 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "close_fds" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed" -dependencies = [ - "cfg-if", - "libc", -] - [[package]] name = "colorchoice" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "combine" +version = "4.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4" +dependencies = [ + "bytes", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "tokio-util", +] + [[package]] name = "comfy-table" version = "6.1.4" @@ -1174,22 +1224,24 @@ dependencies = [ "compute_api", "flate2", "futures", - "hyper", - "nix 0.26.2", + "hyper 0.14.26", + "nix 0.27.1", "notify", "num_cpus", "opentelemetry", "postgres", "regex", 
"remote_storage", - "reqwest", + "reqwest 0.12.4", "rust-ini", "serde", "serde_json", "signal-hook", "tar", + "thiserror", "tokio", "tokio-postgres", + "tokio-stream", "tokio-util", "toml_edit", "tracing", @@ -1238,12 +1290,6 @@ dependencies = [ "tiny-keccak", ] -[[package]] -name = "const_fn" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935" - [[package]] name = "const_format" version = "0.2.30" @@ -1290,8 +1336,10 @@ dependencies = [ "futures", "git-version", "hex", - "hyper", - "nix 0.26.2", + "humantime", + "humantime-serde", + "hyper 0.14.26", + "nix 0.27.1", "once_cell", "pageserver_api", "pageserver_client", @@ -1299,8 +1347,9 @@ dependencies = [ "postgres_backend", "postgres_connection", "regex", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", + "scopeguard", "serde", "serde_json", "serde_with", @@ -1311,6 +1360,7 @@ dependencies = [ "tokio-postgres", "tokio-util", "toml", + "toml_edit", "tracing", "url", "utils", @@ -1344,9 +1394,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" +checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" dependencies = [ "rustc_version", ] @@ -1414,36 +1464,28 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.14" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset 0.8.0", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.15" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" -dependencies = [ - "cfg-if", -] +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "crossterm" @@ -1529,7 +1571,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1540,7 +1582,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1550,7 +1592,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d" dependencies = [ "cfg-if", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core 0.9.8", @@ -1582,6 +1624,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "der-parser" version = "8.2.0" @@ -1596,6 +1648,69 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "desim" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "hex", + "parking_lot 0.12.1", + "rand 0.8.5", + "scopeguard", + "smallvec", + "tracing", + "utils", + "workspace_hack", +] + +[[package]] +name = "diesel" +version = "2.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"62c6fcf842f17f8c78ecf7c81d75c5ce84436b41ee07e03f490fbb5f5a8731d8" +dependencies = [ + "bitflags 2.4.1", + "byteorder", + "diesel_derives", + "itoa", + "pq-sys", + "r2d2", + "serde_json", +] + +[[package]] +name = "diesel_derives" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8337737574f55a468005a83499da720f20c65586241ffea339db9ecdfd2b44" +dependencies = [ + "diesel_table_macro_syntax", + "proc-macro2", + "quote", + "syn 2.0.52", +] + +[[package]] +name = "diesel_migrations" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6036b3f0120c5961381b570ee20a02432d7e2d27ea60de9578799cf9156914ac" +dependencies = [ + "diesel", + "migrations_internals", + "migrations_macros", +] + +[[package]] +name = "diesel_table_macro_syntax" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc5557efc453706fed5e4fa85006fe9817c224c3f480a34c7e5959fd700921c5" +dependencies = [ + "syn 2.0.52", +] + [[package]] name = "digest" version = "0.10.7" @@ -1615,7 +1730,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1639,10 +1754,10 @@ version = "0.14.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" dependencies = [ - "der", + "der 0.6.1", "elliptic-curve", "rfc6979", - "signature", + "signature 1.6.4", ] [[package]] @@ -1659,7 +1774,7 @@ checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" dependencies = [ "base16ct", "crypto-bigint 0.4.9", - "der", + "der 0.6.1", "digest", "ff", "generic-array", @@ -1707,6 +1822,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb" dependencies = [ "enumset_derive", + "serde", ] 
[[package]] @@ -1718,7 +1834,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -1735,24 +1851,19 @@ dependencies = [ ] [[package]] -name = "errno" -version = "0.3.1" +name = "equivalent" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" -dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] -name = "errno-dragonfly" -version = "0.1.2" +name = "errno" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -1826,13 +1937,13 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.21" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" +checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.2.16", + "redox_syscall 0.3.5", "windows-sys 0.48.0", ] @@ -1849,7 +1960,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743" dependencies = [ "crc32fast", - "miniz_oxide 0.7.1", + "miniz_oxide", ] [[package]] @@ -1858,21 +1969,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - [[package]] name = "form_urlencoded" version = "1.1.0" @@ -1882,6 +1978,27 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "framed-websockets" +version = "0.1.0" +source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127" +dependencies = [ + "base64 0.21.1", + "bytemuck", + "bytes", + "futures-core", + "futures-sink", + "http-body-util", + "hyper 1.2.0", + "hyper-util", + "pin-project", + "rand 0.8.5", + "sha1", + "thiserror", + "tokio", + "tokio-util", +] + [[package]] name = "fs2" version = "0.4.3" @@ -1918,9 +2035,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -1928,9 +2045,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" @@ -1945,9 +2062,9 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" 
[[package]] name = "futures-lite" @@ -1966,26 +2083,26 @@ dependencies = [ [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-timer" @@ -1995,9 +2112,9 @@ checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -2047,9 +2164,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.27.2" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "git-version" @@ -2092,17 +2209,36 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.19" +version = "0.3.26" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ "bytes", "fnv", "futures-core", "futures-sink", "futures-util", - "http", - "indexmap", + "http 0.2.9", + "indexmap 2.0.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "h2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "816ec7294445779408f36fe57bc5b7fc1cf59664059096c65f905c1c61f58069" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 1.1.0", + "indexmap 2.0.1", "slab", "tokio", "tokio-util", @@ -2115,6 +2251,17 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + [[package]] name = "hash32" version = "0.3.1" @@ -2141,9 +2288,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash", "allocator-api2", @@ -2151,11 +2298,11 @@ dependencies = [ [[package]] name = "hashlink" -version = "0.8.2" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa" +checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" dependencies = [ - "hashbrown 0.13.2", + 
"hashbrown 0.14.5", ] [[package]] @@ -2190,6 +2337,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.3.3" @@ -2241,6 +2394,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "hostname" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9c7c7c8ac16c798734b8a24560c1362120597c40d5e1459f09498f8f6c8f2ba" +dependencies = [ + "cfg-if", + "libc", + "windows 0.52.0", +] + [[package]] name = "http" version = "0.2.9" @@ -2252,6 +2416,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http-body" version = "0.4.5" @@ -2259,7 +2434,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", - "http", + "http 0.2.9", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", "pin-project-lite", ] @@ -2321,9 +2519,9 @@ dependencies = 
[ "futures-channel", "futures-core", "futures-util", - "h2", - "http", - "http-body", + "h2 0.3.26", + "http 0.2.9", + "http-body 0.4.5", "httparse", "httpdate", "itoa", @@ -2335,19 +2533,57 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.4", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + [[package]] name = "hyper-rustls" version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ - "http", - "hyper", + "http 0.2.9", + "hyper 0.14.26", "log", - "rustls", - "rustls-native-certs", + "rustls 0.21.11", + "rustls-native-certs 0.6.2", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", +] + +[[package]] +name = "hyper-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" +dependencies = [ + "futures-util", + "http 1.1.0", + "hyper 1.2.0", + "hyper-util", + "rustls 0.22.4", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.25.0", + "tower-service", ] [[package]] @@ -2356,36 +2592,30 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.26", "pin-project-lite", "tokio", "tokio-io-timeout", ] [[package]] -name = "hyper-tls" -version = "0.5.0" +name = "hyper-util" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +checksum = 
"ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" dependencies = [ "bytes", - "hyper", - "native-tls", - "tokio", - "tokio-native-tls", -] - -[[package]] -name = "hyper-tungstenite" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9" -dependencies = [ - "hyper", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.2.0", "pin-project-lite", + "socket2 0.5.5", "tokio", - "tokio-tungstenite", - "tungstenite", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -2399,7 +2629,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows", + "windows 0.48.0", ] [[package]] @@ -2438,6 +2668,16 @@ dependencies = [ "serde", ] +[[package]] +name = "indexmap" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" +dependencies = [ + "equivalent", + "hashbrown 0.14.5", +] + [[package]] name = "infer" version = "0.2.3" @@ -2506,6 +2746,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "io-uring" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460648e47a07a43110fbfa2e0b14afb2be920093c31e5dccc50e49568e099762" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + [[package]] name = "ipnet" version = "2.9.0" @@ -2550,9 +2800,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -2565,7 +2815,7 @@ checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4" dependencies = [ "base64 0.21.1", "js-sys", - "pem 
3.0.3", + "pem", "ring 0.17.6", "serde", "serde_json", @@ -2592,6 +2842,16 @@ dependencies = [ "libc", ] +[[package]] +name = "lasso" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2" +dependencies = [ + "dashmap", + "hashbrown 0.13.2", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -2604,6 +2864,17 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "leaky-bucket" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853" +dependencies = [ + "parking_lot 0.12.1", + "tokio", + "tracing", +] + [[package]] name = "libc" version = "0.2.150" @@ -2620,6 +2891,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + [[package]] name = "linux-raw-sys" version = "0.1.4" @@ -2632,6 +2909,18 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + +[[package]] +name = "linux-raw-sys" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4" + [[package]] name = "lock_api" version = "0.4.10" @@ -2648,6 +2937,15 @@ version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lru" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -2669,6 +2967,12 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40" +[[package]] +name = "matchit" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "540f1c43aed89909c0cc0cc604e3bb2f7e7a341a3728a9e6cfe760e733cd11ed" + [[package]] name = "md-5" version = "0.10.5" @@ -2684,6 +2988,47 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" +[[package]] +name = "measured" +version = "0.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" +dependencies = [ + "bytes", + "crossbeam-utils", + "hashbrown 0.14.5", + "itoa", + "lasso", + "measured-derive", + "memchr", + "parking_lot 0.12.1", + "rustc-hash", + "ryu", +] + +[[package]] +name = "measured-derive" +version = "0.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.52", +] + +[[package]] +name = "measured-process" +version = "0.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +dependencies = [ + "libc", + "measured", + "procfs 0.16.0", +] + [[package]] name = "memchr" version = "2.6.4" @@ -2708,48 +3053,65 @@ dependencies = [ "autocfg", ] 
+[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + [[package]] name = "metrics" version = "0.1.0" dependencies = [ "chrono", "libc", + "measured", + "measured-process", "once_cell", + "procfs 0.14.2", "prometheus", + "rand 0.8.5", + "rand_distr", + "twox-hash", "workspace_hack", ] +[[package]] +name = "migrations_internals" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f23f71580015254b020e856feac3df5878c2c7a8812297edd6c0a485ac9dada" +dependencies = [ + "serde", + "toml", +] + +[[package]] +name = "migrations_macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cce3325ac70e67bbab5bd837a31cae01f1a6db64e0e744a33cb03a543469ef08" +dependencies = [ + "migrations_internals", + "proc-macro2", + "quote", +] + [[package]] name = "mime" version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "mime_guess" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" -dependencies = [ - "mime", - "unicase", -] - [[package]] name = "minimal-lexical" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" -[[package]] -name = "miniz_oxide" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" -dependencies = [ - "adler", -] - [[package]] name = "miniz_oxide" version = "0.7.1" @@ -2761,9 +3123,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.10" 
+version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", @@ -2777,24 +3139,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - [[package]] name = "nix" version = "0.25.1" @@ -2809,16 +3153,27 @@ dependencies = [ [[package]] name = "nix" -version = "0.26.2" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" dependencies = [ "bitflags 1.3.2", "cfg-if", "libc", "memoffset 0.7.1", "pin-utils", - "static_assertions", +] + +[[package]] +name = "nix" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.4.1", + "cfg-if", + "libc", + "memoffset 0.9.0", ] [[package]] @@ -2833,20 +3188,21 @@ dependencies = [ [[package]] name = "notify" -version = "5.2.0" +version = "6.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" dependencies = [ - "bitflags 1.3.2", + "bitflags 
2.4.1", "crossbeam-channel", "filetime", "fsevent-sys", "inotify 0.9.6", "kqueue", "libc", + "log", "mio", "walkdir", - "windows-sys 0.45.0", + "windows-sys 0.48.0", ] [[package]] @@ -2858,6 +3214,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num" version = "0.4.1" @@ -2930,6 +3296,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -2942,15 +3309,6 @@ dependencies = [ "libc", ] -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" -dependencies = [ - "libc", -] - [[package]] name = "oauth2" version = "4.4.2" @@ -2960,7 +3318,7 @@ dependencies = [ "base64 0.13.1", "chrono", "getrandom 0.2.11", - "http", + "http 0.2.9", "rand 0.8.5", "serde", "serde_json", @@ -2972,9 +3330,9 @@ dependencies = [ [[package]] name = "object" -version = "0.30.3" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] @@ -3000,55 +3358,17 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "openssl" -version = "0.10.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" -dependencies = [ - "bitflags 2.4.1", 
- "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.32", -] - [[package]] name = "openssl-probe" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" -[[package]] -name = "openssl-sys" -version = "0.9.96" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - [[package]] name = "opentelemetry" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f" +checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54" dependencies = [ "opentelemetry_api", "opentelemetry_sdk", @@ -3056,67 +3376,69 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906" +checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b" dependencies = [ "async-trait", "bytes", - "http", + "http 0.2.9", "opentelemetry_api", - "reqwest", + "reqwest 0.11.19", ] [[package]] name = "opentelemetry-otlp" -version = "0.12.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca" +checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" dependencies = [ "async-trait", - 
"futures", - "futures-util", - "http", - "opentelemetry", + "futures-core", + "http 0.2.9", "opentelemetry-http", "opentelemetry-proto", + "opentelemetry-semantic-conventions", + "opentelemetry_api", + "opentelemetry_sdk", "prost", - "reqwest", + "reqwest 0.11.19", "thiserror", + "tokio", + "tonic", ] [[package]] name = "opentelemetry-proto" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c" +checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb" dependencies = [ - "futures", - "futures-util", - "opentelemetry", + "opentelemetry_api", + "opentelemetry_sdk", "prost", - "tonic 0.8.3", + "tonic", ] [[package]] name = "opentelemetry-semantic-conventions" -version = "0.11.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5" +checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269" dependencies = [ "opentelemetry", ] [[package]] name = "opentelemetry_api" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2" +checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b" dependencies = [ - "fnv", "futures-channel", "futures-util", - "indexmap", + "indexmap 1.9.3", + "js-sys", "once_cell", "pin-project-lite", "thiserror", @@ -3125,21 +3447,22 @@ dependencies = [ [[package]] name = "opentelemetry_sdk" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1" +checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026" dependencies = [ "async-trait", "crossbeam-channel", - "dashmap", - "fnv", 
"futures-channel", "futures-executor", "futures-util", "once_cell", "opentelemetry_api", + "ordered-float 3.9.2", "percent-encoding", "rand 0.8.5", + "regex", + "serde_json", "thiserror", "tokio", "tokio-stream", @@ -3155,13 +3478,22 @@ dependencies = [ ] [[package]] -name = "ordered-multimap" -version = "0.7.1" +name = "ordered-float" +version = "3.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-multimap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" dependencies = [ "dlv-list", - "hashbrown 0.14.0", + "hashbrown 0.14.5", ] [[package]] @@ -3181,6 +3513,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.11.1" @@ -3197,18 +3535,19 @@ name = "pagebench" version = "0.1.0" dependencies = [ "anyhow", + "camino", "clap", "futures", "hdrhistogram", "humantime", "humantime-serde", - "pageserver", "pageserver_api", "pageserver_client", "rand 0.8.5", "serde", "serde_json", "tokio", + "tokio-util", "tracing", "utils", "workspace_hack", @@ -3223,12 +3562,18 @@ dependencies = [ "camino", "clap", "git-version", + "humantime", "pageserver", + "pageserver_api", "postgres_ffi", + "remote_storage", "serde", "serde_json", "svg_fmt", + "thiserror", "tokio", + "tokio-util", + "toml_edit", "utils", "workspace_hack", ] @@ -3238,6 +3583,7 @@ name = "pageserver" version = "0.1.0" 
dependencies = [ "anyhow", + "arc-swap", "async-compression", "async-stream", "async-trait", @@ -3247,7 +3593,6 @@ dependencies = [ "camino-tempfile", "chrono", "clap", - "close_fds", "const_format", "consumption_metrics", "crc32c", @@ -3264,15 +3609,17 @@ dependencies = [ "hex-literal", "humantime", "humantime-serde", - "hyper", + "hyper 0.14.26", "itertools", + "leaky-bucket", "md5", "metrics", - "nix 0.26.2", + "nix 0.27.1", "num-traits", "num_cpus", "once_cell", "pageserver_api", + "pageserver_compaction", "pin-project-lite", "postgres", "postgres-protocol", @@ -3281,10 +3628,11 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", + "procfs 0.14.2", "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "rpds", "scopeguard", "serde", @@ -3298,9 +3646,11 @@ dependencies = [ "strum_macros", "svg_fmt", "sync_wrapper", + "sysinfo", "tenant_size_model", "thiserror", "tokio", + "tokio-epoll-uring", "tokio-io-timeout", "tokio-postgres", "tokio-stream", @@ -3308,6 +3658,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "twox-hash", "url", "utils", "walkdir", @@ -3322,9 +3673,13 @@ dependencies = [ "bincode", "byteorder", "bytes", + "chrono", "const_format", "enum-map", "hex", + "humantime", + "humantime-serde", + "itertools", "postgres_ffi", "rand 0.8.5", "serde", @@ -3347,7 +3702,7 @@ dependencies = [ "futures", "pageserver_api", "postgres", - "reqwest", + "reqwest 0.12.4", "serde", "thiserror", "tokio", @@ -3358,6 +3713,52 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_compaction" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-compression", + "async-stream", + "byteorder", + "bytes", + "chrono", + "clap", + "const_format", + "consumption_metrics", + "criterion", + "crossbeam-utils", + "either", + "fail", + "flate2", + "futures", + "git-version", + "hex", + "hex-literal", + "humantime", + "humantime-serde", + "itertools", + "metrics", + "once_cell", + "pageserver_api", + 
"pin-project-lite", + "rand 0.8.5", + "smallvec", + "svg_fmt", + "sync_wrapper", + "thiserror", + "tokio", + "tokio-io-timeout", + "tokio-util", + "tracing", + "tracing-error", + "tracing-subscriber", + "url", + "utils", + "walkdir", + "workspace_hack", +] + [[package]] name = "parking" version = "2.1.1" @@ -3414,13 +3815,14 @@ dependencies = [ [[package]] name = "parquet" -version = "49.0.0" -source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "ahash", "bytes", "chrono", - "hashbrown 0.14.0", + "half 2.4.1", + "hashbrown 0.14.5", "num", "num-bigint", "paste", @@ -3428,17 +3830,18 @@ dependencies = [ "thrift", "twox-hash", "zstd", + "zstd-sys", ] [[package]] name = "parquet_derive" -version = "49.0.0" -source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9" +version = "51.0.0" +source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829" dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3460,9 +3863,9 @@ checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "pbkdf2" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" dependencies = [ "digest", "hmac", @@ -3476,16 +3879,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" -[[package]] -name = "pem" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6b13fe415cdf3c8e44518e18a7c95a13431d9bdf6d15367d82b23c377fdd441a" -dependencies = [ - "base64 0.21.1", - "serde", -] - [[package]] name = "pem" version = "3.0.3" @@ -3509,7 +3902,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" dependencies = [ "fixedbitset", - "indexmap", + "indexmap 1.9.3", ] [[package]] @@ -3547,7 +3940,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3568,8 +3961,8 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" dependencies = [ - "der", - "spki", + "der 0.6.1", + "spki 0.6.0", ] [[package]] @@ -3609,7 +4002,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3619,21 +4012,10 @@ dependencies = [ "tokio-postgres", ] -[[package]] -name = "postgres-native-tls" -version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" -dependencies = [ - "native-tls", - "tokio", - "tokio-native-tls", - "tokio-postgres", -] - [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -3646,12 +4028,13 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", + "tokio", ] 
[[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -3668,14 +4051,14 @@ dependencies = [ "futures", "once_cell", "pq_proto", - "rustls", - "rustls-pemfile", + "rustls 0.22.4", + "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", "tracing", "workspace_hack", ] @@ -3722,15 +4105,26 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "pq-sys" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c0052426df997c0cbd30789eb44ca097e3541717a7b8fa36b1c464ee7edebd" +dependencies = [ + "vcpkg", +] + [[package]] name = "pq_proto" version = "0.1.0" dependencies = [ "byteorder", "bytes", + "itertools", "pin-project-lite", "postgres-protocol", "rand 0.8.5", + "serde", "thiserror", "tokio", "tracing", @@ -3754,7 +4148,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -3765,9 +4159,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -3780,11 +4174,36 @@ checksum = 
"b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" dependencies = [ "bitflags 1.3.2", "byteorder", + "chrono", + "flate2", "hex", "lazy_static", "rustix 0.36.16", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.1", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.28", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.1", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -3797,7 +4216,7 @@ dependencies = [ "libc", "memchr", "parking_lot 0.12.1", - "procfs", + "procfs 0.14.2", "thiserror", ] @@ -3818,7 +4237,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -3859,8 +4278,15 @@ dependencies = [ name = "proxy" version = "0.1.0" dependencies = [ + "ahash", "anyhow", + "async-compression", "async-trait", + "atomic-take", + "aws-config", + "aws-sdk-iam", + "aws-sigv4", + "aws-types", "base64 0.13.1", "bstr", "bytes", @@ -3869,22 +4295,32 @@ dependencies = [ "chrono", "clap", "consumption_metrics", + "crossbeam-deque", "dashmap", + "env_logger", + "fallible-iterator", + "framed-websockets", "futures", "git-version", - "hashbrown 0.13.2", + "hashbrown 0.14.5", "hashlink", "hex", "hmac", - "hostname", + "hostname 0.3.1", + "http 1.1.0", + "http-body-util", "humantime", - "hyper", - "hyper-tungstenite", + "humantime-serde", + "hyper 0.14.26", + "hyper 1.2.0", + "hyper-util", + "indexmap 2.0.1", "ipnet", "itertools", + "lasso", "md5", + "measured", "metrics", - "native-tls", 
"once_cell", "opentelemetry", "parking_lot 0.12.1", @@ -3892,47 +4328,54 @@ dependencies = [ "parquet_derive", "pbkdf2", "pin-project-lite", - "postgres-native-tls", "postgres-protocol", "postgres_backend", "pq_proto", "prometheus", "rand 0.8.5", + "rand_distr", "rcgen", + "redis", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "reqwest-retry", "reqwest-tracing", "routerify", "rstest", "rustc-hash", - "rustls", - "rustls-pemfile", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", + "rustls-pemfile 2.1.1", "scopeguard", "serde", "serde_json", "sha2", + "smallvec", "smol_str", "socket2 0.5.5", - "sync_wrapper", + "subtle", "task-local-extensions", "thiserror", - "tls-listener", + "tikv-jemalloc-ctl", + "tikv-jemallocator", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls", + "tokio-rustls 0.25.0", + "tokio-tungstenite", "tokio-util", + "tower-service", "tracing", "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "url", + "urlencoding", "utils", "uuid", - "webpki-roots 0.25.2", + "walkdir", "workspace_hack", "x509-parser", ] @@ -3949,13 +4392,24 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.32" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f3b39ccfb720540debaa0164757101c08ecb8d326b15358ce76a62c7e85965" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] +[[package]] +name = "r2d2" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot 0.12.1", + "scheduled-thread-pool", +] + [[package]] name = "rand" version = "0.7.3" @@ -4018,6 +4472,16 @@ dependencies = [ "getrandom 0.2.11", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + [[package]] name = "rand_hc" version = "0.2.0" @@ -4051,16 +4515,42 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.11.1" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" +checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" dependencies = [ - "pem 2.0.1", - "ring 0.16.20", + "pem", + "ring 0.17.6", "time", "yasna", ] +[[package]] +name = "redis" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb" +dependencies = [ + "async-trait", + "bytes", + "combine", + "futures-util", + "itoa", + "percent-encoding", + "pin-project-lite", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "ryu", + "sha1_smol", + "socket2 0.5.5", + "tokio", + "tokio-rustls 0.25.0", + "tokio-util", + "url", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -4111,6 +4601,12 @@ dependencies = [ "regex-syntax 0.8.2", ] +[[package]] +name = "regex-lite" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" + [[package]] name = "regex-syntax" version = "0.6.29" @@ -4150,7 +4646,8 @@ dependencies = [ "futures", "futures-util", "http-types", - "hyper", + "humantime", + "hyper 0.14.26", "itertools", "metrics", "once_cell", @@ -4159,8 +4656,10 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "sync_wrapper", "test-context", "tokio", + "tokio-stream", "tokio-util", "toml_edit", "tracing", @@ -4179,73 +4678,112 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-rustls", - 
"hyper-tls", + "h2 0.3.26", + "http 0.2.9", + "http-body 0.4.5", + "hyper 0.14.26", + "hyper-rustls 0.24.0", "ipnet", "js-sys", "log", "mime", - "mime_guess", - "native-tls", "once_cell", "percent-encoding", "pin-project-lite", - "rustls", - "rustls-pemfile", + "rustls 0.21.11", + "rustls-pemfile 1.0.2", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-native-tls", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", - "wasm-streams", + "wasm-streams 0.3.0", "web-sys", "webpki-roots 0.25.2", - "winreg", + "winreg 0.50.0", +] + +[[package]] +name = "reqwest" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.2.0", + "hyper-rustls 0.26.0", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls 0.22.4", + "rustls-pemfile 2.1.1", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls 0.25.0", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams 0.4.0", + "web-sys", + "webpki-roots 0.26.1", + "winreg 0.52.0", ] [[package]] name = "reqwest-middleware" -version = "0.2.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" +checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" dependencies = [ "anyhow", "async-trait", - "http", - "reqwest", + "http 1.1.0", + "reqwest 0.12.4", "serde", - "task-local-extensions", "thiserror", + "tower-service", ] [[package]] name = "reqwest-retry" -version = "0.2.2" 
+version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48d0fd6ef4c6d23790399fe15efc8d12cd9f3d4133958f9bd7801ee5cbaec6c4" +checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" dependencies = [ "anyhow", "async-trait", "chrono", "futures", "getrandom 0.2.11", - "http", - "hyper", + "http 1.1.0", + "hyper 1.2.0", "parking_lot 0.11.2", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", "retry-policies", - "task-local-extensions", "tokio", "tracing", "wasm-timer", @@ -4253,27 +4791,27 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.5" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8" +checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3" dependencies = [ "anyhow", "async-trait", "getrandom 0.2.11", - "matchit", + "http 1.1.0", + "matchit 0.8.2", "opentelemetry", - "reqwest", + "reqwest 0.12.4", "reqwest-middleware", - "task-local-extensions", "tracing", "tracing-opentelemetry", ] [[package]] name = "retry-policies" -version = "0.1.2" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" +checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" dependencies = [ "anyhow", "chrono", @@ -4326,8 +4864,8 @@ version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496c1d3718081c45ba9c31fbfc07417900aa96f4070ff90dc29961836b7a9945" dependencies = [ - "http", - "hyper", + "http 0.2.9", + "hyper 0.14.26", "lazy_static", "percent-encoding", "regex", @@ -4367,7 +4905,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.32", + "syn 2.0.52", "unicode-ident", ] @@ -4440,10 +4978,23 @@ dependencies = [ ] [[package]] -name = "rustls" -version = "0.21.9" +name = "rustix" 
+version = "0.38.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.21.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" dependencies = [ "log", "ring 0.17.6", @@ -4451,6 +5002,20 @@ dependencies = [ "sct", ] +[[package]] +name = "rustls" +version = "0.22.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432" +dependencies = [ + "log", + "ring 0.17.6", + "rustls-pki-types", + "rustls-webpki 0.102.2", + "subtle", + "zeroize", +] + [[package]] name = "rustls-native-certs" version = "0.6.2" @@ -4458,7 +5023,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0167bac7a9f490495f3c33013e7722b53cb087ecbe082fb0c6387c96f634ea50" dependencies = [ "openssl-probe", - "rustls-pemfile", + "rustls-pemfile 1.0.2", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-native-certs" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792" +dependencies = [ + "openssl-probe", + "rustls-pemfile 2.1.1", + "rustls-pki-types", "schannel", "security-framework", ] @@ -4472,6 +5050,22 @@ dependencies = [ "base64 0.21.1", ] +[[package]] +name = "rustls-pemfile" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab" +dependencies = [ + "base64 0.21.1", + "rustls-pki-types", +] + +[[package]] +name = 
"rustls-pki-types" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" + [[package]] name = "rustls-webpki" version = "0.100.2" @@ -4492,6 +5086,17 @@ dependencies = [ "untrusted 0.9.0", ] +[[package]] +name = "rustls-webpki" +version = "0.102.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +dependencies = [ + "ring 0.17.6", + "rustls-pki-types", + "untrusted 0.9.0", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -4504,45 +5109,6 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" -[[package]] -name = "s3_scrubber" -version = "0.1.0" -dependencies = [ - "anyhow", - "async-stream", - "aws-config", - "aws-sdk-s3", - "aws-smithy-async", - "bincode", - "bytes", - "chrono", - "clap", - "crc32c", - "either", - "futures", - "futures-util", - "hex", - "histogram", - "itertools", - "pageserver", - "pageserver_api", - "rand 0.8.5", - "remote_storage", - "reqwest", - "serde", - "serde_json", - "serde_with", - "thiserror", - "tokio", - "tokio-rustls", - "tokio-stream", - "tracing", - "tracing-appender", - "tracing-subscriber", - "utils", - "workspace_hack", -] - [[package]] name = "safekeeper" version = "0.1.0" @@ -4558,13 +5124,14 @@ dependencies = [ "clap", "const_format", "crc32c", + "desim", "fail", "fs2", "futures", "git-version", "hex", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", @@ -4573,9 +5140,10 @@ dependencies = [ "postgres_backend", "postgres_ffi", "pq_proto", + "rand 0.8.5", "regex", "remote_storage", - "reqwest", + "reqwest 0.12.4", "safekeeper_api", "scopeguard", "sd-notify", @@ -4590,11 +5158,14 @@ dependencies = [ "tokio-io-timeout", "tokio-postgres", "tokio-stream", + "tokio-tar", 
"tokio-util", "toml_edit", "tracing", + "tracing-subscriber", "url", "utils", + "walproposer", "workspace_hack", ] @@ -4627,6 +5198,15 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "scheduled-thread-pool" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +dependencies = [ + "parking_lot 0.12.1", +] + [[package]] name = "scopeguard" version = "1.1.0" @@ -4656,7 +5236,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" dependencies = [ "base16ct", - "der", + "der 0.6.1", "generic-array", "pkcs8", "subtle", @@ -4694,13 +5274,13 @@ checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "sentry" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b" +checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", - "reqwest", - "rustls", + "reqwest 0.12.4", + "rustls 0.21.11", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -4713,9 +5293,9 @@ dependencies = [ [[package]] name = "sentry-backtrace" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9" +checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e" dependencies = [ "backtrace", "once_cell", @@ -4725,11 +5305,11 @@ dependencies = [ [[package]] name = "sentry-contexts" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a" +checksum = 
"eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a" dependencies = [ - "hostname", + "hostname 0.4.0", "libc", "os_info", "rustc_version", @@ -4739,9 +5319,9 @@ dependencies = [ [[package]] name = "sentry-core" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055" +checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826" dependencies = [ "once_cell", "rand 0.8.5", @@ -4752,9 +5332,9 @@ dependencies = [ [[package]] name = "sentry-panic" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7" +checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d" dependencies = [ "sentry-backtrace", "sentry-core", @@ -4762,9 +5342,9 @@ dependencies = [ [[package]] name = "sentry-tracing" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3" +checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe" dependencies = [ "sentry-backtrace", "sentry-core", @@ -4774,13 +5354,13 @@ dependencies = [ [[package]] name = "sentry-types" -version = "0.31.6" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd" +checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c" dependencies = [ "debugid", - "getrandom 0.2.11", "hex", + "rand 0.8.5", "serde", "serde_json", "thiserror", @@ -4822,7 +5402,7 @@ checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -4887,7 +5467,7 @@ dependencies = [ "base64 
0.13.1", "chrono", "hex", - "indexmap", + "indexmap 1.9.3", "serde", "serde_json", "serde_with_macros", @@ -4903,7 +5483,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -4918,14 +5498,30 @@ dependencies = [ ] [[package]] -name = "sha2" -version = "0.10.6" +name = "sha1_smol" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", "digest", + "sha2-asm", +] + +[[package]] +name = "sha2-asm" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f27ba7066011e3fb30d808b51affff34f0a66d3a03a58edd787c6e420e40e44e" +dependencies = [ + "cc", ] [[package]] @@ -4939,9 +5535,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook" @@ -4983,6 +5579,15 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "rand_core 0.6.4", +] + [[package]] name = "simple_asn1" version = "0.6.2" @@ -5012,15 +5617,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "smol_str" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c" +checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49" dependencies = [ "serde", ] @@ -5067,7 +5672,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" dependencies = [ "base64ct", - "der", + "der 0.6.1", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der 0.7.8", ] [[package]] @@ -5096,20 +5711,132 @@ dependencies = [ "futures-util", "git-version", "humantime", - "hyper", + "hyper 0.14.26", "metrics", "once_cell", "parking_lot 0.12.1", "prost", "tokio", "tokio-stream", - "tonic 0.9.2", + "tonic", "tonic-build", "tracing", "utils", "workspace_hack", ] +[[package]] +name = "storage_controller" +version = "0.1.0" +dependencies = [ + "anyhow", + "aws-config", + "bytes", + "camino", + "clap", + "control_plane", + "diesel", + "diesel_migrations", + "fail", + "futures", + "git-version", + "hex", + "humantime", + "hyper 0.14.26", + "itertools", + "lasso", + "measured", + "metrics", + "once_cell", + "pageserver_api", + "pageserver_client", + "postgres_connection", + "r2d2", + "reqwest 0.12.4", + "routerify", + "scopeguard", + "serde", + "serde_json", + "strum", + "strum_macros", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "utils", + "workspace_hack", +] + +[[package]] +name = "storage_scrubber" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-stream", + "aws-config", + "aws-sdk-s3", + 
"aws-smithy-async", + "bincode", + "bytes", + "camino", + "chrono", + "clap", + "crc32c", + "either", + "futures", + "futures-util", + "hex", + "histogram", + "humantime", + "itertools", + "once_cell", + "pageserver", + "pageserver_api", + "postgres_ffi", + "rand 0.8.5", + "remote_storage", + "reqwest 0.12.4", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", + "serde", + "serde_json", + "serde_with", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-postgres-rustls", + "tokio-rustls 0.25.0", + "tokio-stream", + "tokio-util", + "tracing", + "tracing-appender", + "tracing-subscriber", + "utils", + "workspace_hack", +] + +[[package]] +name = "storcon_cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "comfy-table", + "futures", + "humantime", + "hyper 0.14.26", + "pageserver_api", + "pageserver_client", + "reqwest 0.12.4", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "stringprep" version = "0.1.2" @@ -5138,7 +5865,7 @@ version = "0.24.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -5153,9 +5880,8 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" +version = "0.4.2" +source = "git+https://github.com/nical/rust_debug?rev=28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4#28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" [[package]] name = "syn" @@ -5170,9 +5896,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.32" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" 
+checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -5184,6 +5910,9 @@ name = "sync_wrapper" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -5266,43 +5995,43 @@ dependencies = [ [[package]] name = "test-context" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9" dependencies = [ - "async-trait", "futures", "test-context-macros", ] [[package]] name = "test-context-macros" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ + "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5323,7 +6052,38 @@ checksum = 
"7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", "integer-encoding", - "ordered-float", + "ordered-float 2.10.1", +] + +[[package]] +name = "tikv-jemalloc-ctl" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c" +dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.5.4+5.3.0-patched" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca" +dependencies = [ + "libc", + "tikv-jemalloc-sys", ] [[package]] @@ -5334,8 +6094,6 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" dependencies = [ "itoa", "js-sys", - "libc", - "num_threads", "serde", "time-core", "time-macros", @@ -5390,25 +6148,11 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tls-listener" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81294c017957a1a69794f506723519255879e15a870507faf45dfed288b763dd" -dependencies = [ - "futures-util", - "hyper", - "pin-project-lite", - "thiserror", - "tokio", - "tokio-rustls", -] - [[package]] name = "tokio" -version = "1.34.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", 
@@ -5422,6 +6166,22 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "tokio-epoll-uring" +version = "0.1.0" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" +dependencies = [ + "futures", + "nix 0.26.4", + "once_cell", + "scopeguard", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uring-common", +] + [[package]] name = "tokio-io-timeout" version = "1.2.0" @@ -5440,23 +6200,13 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", -] - -[[package]] -name = "tokio-native-tls" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" -dependencies = [ - "native-tls", - "tokio", + "syn 2.0.52", ] [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", @@ -5478,16 +6228,17 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd5831152cb0d3f79ef5523b357319ba154795d64c7078b2daa95a803b54057f" +checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" dependencies = [ "futures", - "ring 0.16.20", - "rustls", + "ring 0.17.6", + "rustls 0.22.4", "tokio", "tokio-postgres", - "tokio-rustls", + "tokio-rustls 0.25.0", + "x509-certificate", ] [[package]] @@ -5496,7 +6247,18 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls", + "rustls 
0.21.11", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f" +dependencies = [ + "rustls 0.22.4", + "rustls-pki-types", "tokio", ] @@ -5549,7 +6311,7 @@ dependencies = [ "futures-io", "futures-sink", "futures-util", - "hashbrown 0.14.0", + "hashbrown 0.14.5", "pin-project-lite", "tokio", "tracing", @@ -5582,45 +6344,13 @@ version = "0.19.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739" dependencies = [ - "indexmap", + "indexmap 1.9.3", "serde", "serde_spanned", "toml_datetime", "winnow", ] -[[package]] -name = "tonic" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb" -dependencies = [ - "async-stream", - "async-trait", - "axum", - "base64 0.13.1", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "hyper", - "hyper-timeout", - "percent-encoding", - "pin-project", - "prost", - "prost-derive", - "tokio", - "tokio-stream", - "tokio-util", - "tower", - "tower-layer", - "tower-service", - "tracing", - "tracing-futures", -] - [[package]] name = "tonic" version = "0.9.2" @@ -5634,18 +6364,18 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "h2", - "http", - "http-body", - "hyper", + "h2 0.3.26", + "http 0.2.9", + "http-body 0.4.5", + "hyper 0.14.26", "hyper-timeout", "percent-encoding", "pin-project", "prost", - "rustls-native-certs", - "rustls-pemfile", + "rustls-native-certs 0.6.2", + "rustls-pemfile 1.0.2", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-stream", "tower", "tower-layer", @@ -5674,7 +6404,7 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", - 
"indexmap", + "indexmap 1.9.3", "pin-project", "pin-project-lite", "rand 0.8.5", @@ -5741,7 +6471,7 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] @@ -5764,16 +6494,6 @@ dependencies = [ "tracing-subscriber", ] -[[package]] -name = "tracing-futures" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" -dependencies = [ - "pin-project", - "tracing", -] - [[package]] name = "tracing-log" version = "0.1.3" @@ -5787,12 +6507,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.19.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600" +checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" dependencies = [ "once_cell", "opentelemetry", + "opentelemetry_sdk", + "smallvec", "tracing", "tracing-core", "tracing-log", @@ -5816,6 +6538,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", + "nu-ansi-term", "once_cell", "regex", "serde", @@ -5833,11 +6556,11 @@ dependencies = [ name = "tracing-utils" version = "0.1.0" dependencies = [ - "hyper", + "hyper 0.14.26", "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", - "reqwest", + "reqwest 0.12.4", "tokio", "tracing", "tracing-opentelemetry", @@ -5860,7 +6583,7 @@ dependencies = [ "byteorder", "bytes", "data-encoding", - "http", + "http 0.2.9", "httparse", "log", "rand 0.8.5", @@ -5886,15 +6609,6 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" -[[package]] -name = "tz-rs" -version 
= "0.6.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" -dependencies = [ - "const_fn", -] - [[package]] name = "uname" version = "0.1.1" @@ -5904,15 +6618,6 @@ dependencies = [ "libc", ] -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" -dependencies = [ - "version_check", -] - [[package]] name = "unicode-bidi" version = "0.3.13" @@ -5967,12 +6672,23 @@ dependencies = [ "base64 0.21.1", "log", "once_cell", - "rustls", + "rustls 0.21.11", "rustls-webpki 0.100.2", "url", "webpki-roots 0.23.1", ] +[[package]] +name = "uring-common" +version = "0.1.0" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" +dependencies = [ + "bytes", + "io-uring", + "libc", + "linux-raw-sys 0.6.4", +] + [[package]] name = "url" version = "2.3.1" @@ -6009,6 +6725,7 @@ version = "0.1.0" dependencies = [ "anyhow", "arc-swap", + "async-compression", "async-trait", "bincode", "byteorder", @@ -6023,10 +6740,12 @@ dependencies = [ "heapless", "hex", "hex-literal", - "hyper", + "humantime", + "hyper 0.14.26", "jsonwebtoken", + "leaky-bucket", "metrics", - "nix 0.26.2", + "nix 0.27.1", "once_cell", "pin-project-lite", "postgres_connection", @@ -6046,12 +6765,14 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", + "tokio-tar", "tokio-util", "tracing", "tracing-error", "tracing-subscriber", "url", "uuid", + "walkdir", "workspace_hack", ] @@ -6179,9 +6900,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" +checksum = 
"4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -6189,24 +6910,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.36" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ "cfg-if", "js-sys", @@ -6216,9 +6937,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6226,22 +6947,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.86" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "wasm-streams" @@ -6256,6 +6977,19 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasm-timer" version = "0.2.5" @@ -6273,9 +7007,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.63" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -6296,6 +7030,15 @@ version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" +[[package]] +name = "webpki-roots" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "which" version = "4.4.0" @@ -6347,6 +7090,25 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + 
"windows-targets 0.52.4", +] + [[package]] name = "windows-sys" version = "0.42.0" @@ -6380,6 +7142,15 @@ dependencies = [ "windows-targets 0.48.0", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6410,6 +7181,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6422,6 +7208,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6434,6 +7226,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6446,6 +7244,12 @@ version = 
"0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -6458,6 +7262,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -6470,6 +7280,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -6482,6 +7298,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -6494,6 +7316,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = 
"windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + [[package]] name = "winnow" version = "0.4.6" @@ -6513,17 +7341,27 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "workspace_hack" version = "0.1.0" dependencies = [ + "ahash", "anyhow", "aws-config", "aws-runtime", "aws-sigv4", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-runtime-api", "aws-smithy-types", "axum", "base64 0.21.1", @@ -6534,10 +7372,8 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", - "dashmap", "either", "fail", - "futures", "futures-channel", "futures-core", "futures-executor", @@ -6545,9 +7381,11 @@ dependencies = [ "futures-sink", "futures-util", "getrandom 0.2.11", + "hashbrown 0.14.5", "hex", "hmac", - "hyper", + "hyper 0.14.26", + "indexmap 1.9.3", "itertools", "libc", "log", @@ -6563,34 +7401,56 @@ dependencies = [ "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest", - "ring 0.16.20", - "rustls", + "reqwest 0.11.19", + "reqwest 0.12.4", + "rustls 0.21.11", "scopeguard", "serde", "serde_json", + "sha2", "smallvec", "subtle", "syn 1.0.109", - "syn 2.0.32", + "syn 2.0.52", + "sync_wrapper", "time", "time-macros", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.0", "tokio-util", "toml_datetime", "toml_edit", + "tonic", "tower", "tracing", "tracing-core", - "tungstenite", "url", "uuid", + "zeroize", "zstd", "zstd-safe", "zstd-sys", ] +[[package]] +name = "x509-certificate" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66534846dec7a11d7c50a74b7cdb208b9a581cad890b7866430d438455847c85" +dependencies = [ + 
"bcder", + "bytes", + "chrono", + "der 0.7.8", + "hex", + "pem", + "ring 0.17.6", + "signature 2.2.0", + "spki 0.7.3", + "thiserror", + "zeroize", +] + [[package]] name = "x509-parser" version = "0.15.0" @@ -6649,14 +7509,28 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.52", ] [[package]] name = "zeroize" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index e9172809d7..8fddaaef12 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,20 +3,24 @@ resolver = "2" members = [ "compute_tools", "control_plane", + "control_plane/storcon_cli", "pageserver", + "pageserver/compaction", "pageserver/ctl", "pageserver/client", "pageserver/pagebench", "proxy", "safekeeper", "storage_broker", - "s3_scrubber", + "storage_controller", + "storage_scrubber", "workspace_hack", "trace", "libs/compute_api", "libs/pageserver_api", "libs/postgres_ffi", "libs/safekeeper_api", + "libs/desim", "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", @@ -37,21 +41,26 @@ license = "Apache-2.0" ## All dependency versions, used in the project [workspace.dependencies] +ahash = "0.8" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } -azure_core = "0.18" -azure_identity = "0.18" -azure_storage = "0.18" 
-azure_storage_blobs = "0.18" +atomic-take = "1.1.0" +azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] } +azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } +azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] } flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.0", default-features = false, features=["rustls"] } -aws-sdk-s3 = "1.0" -aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] } -aws-smithy-types = "1.0" -aws-credential-types = "1.0" +aws-config = { version = "1.3", default-features = false, features=["rustls"] } +aws-sdk-s3 = "1.26" +aws-sdk-iam = "1.15.0" +aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } +aws-smithy-types = "1.1.9" +aws-credential-types = "1.2.0" +aws-sigv4 = { version = "1.2.1", features = ["sign-http"] } +aws-types = "1.2.0" axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" @@ -63,72 +72,81 @@ camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } clap = { version = "4.0", features = ["derive"] } -close_fds = "0.3.2" comfy-table = "6.1" const_format = "0.2" crc32c = "0.6" +crossbeam-deque = "0.8.5" crossbeam-utils = "0.8.5" dashmap = { version = "5.5.0", features = ["raw-api"] } either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" fail = "0.5.0" +fallible-iterator = "0.2" +framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" } fs2 = "0.4.3" futures = "0.3" futures-core = "0.3" futures-util = "0.3" git-version = "0.3" -hashbrown = "0.13" -hashlink = "0.8.1" +hashbrown = "0.14" +hashlink = "0.9.1" hdrhistogram = "7.5.2" hex = "0.4" 
hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" +http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" -hyper-tungstenite = "0.11" +tokio-tungstenite = "0.20.0" +indexmap = "2" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "9" +lasso = "0.7" +leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" +measured = { version = "0.0.21", features=["lasso"] } +measured-process = { version = "0.0.21" } memoffset = "0.8" -native-tls = "0.2" -nix = "0.26" -notify = "5.0.0" +nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } +notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.19.0" -opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.11.0" +opentelemetry = "0.20.0" +opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.12.0" parking_lot = "0.12" -parquet = { version = "49.0.0", default-features = false, features = ["zstd"] } -parquet_derive = "49.0.0" +parquet = { version = "51.0.0", default-features = false, features = ["zstd"] } +parquet_derive = "51.0.0" pbkdf2 = { version = "0.12.1", features = ["simple", "std"] } pin-project-lite = "0.2" -prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency +procfs = "0.14" +prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.11" rand = "0.8" +redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" -reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.4.0", features = 
["opentelemetry_0_19"] } -reqwest-middleware = "0.2.0" -reqwest-retry = "0.2.2" +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] } +reqwest-middleware = "0.3.0" +reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.21" -rustls-pemfile = "1" +rustls = "0.22" +rustls-pemfile = "2" rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" -sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } +sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" serde_path_to_error = "0.1" @@ -141,31 +159,38 @@ smol_str = { version = "0.2.0", features = ["serde"] } socket2 = "0.5" strum = "0.24" strum_macros = "0.24" -svg_fmt = "0.4.1" +"subtle" = "2.5.0" +# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet +svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" } sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" -test-context = "0.1" +test-context = "0.3" thiserror = "1.0" -tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] } +tikv-jemallocator = "0.5" +tikv-jemalloc-ctl = "0.5" tokio = { version = "1.17", features = ["macros"] } +tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.10.0" -tokio-rustls = "0.24" +tokio-postgres-rustls = "0.11.0" +tokio-rustls = "0.25" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } toml = "0.7" toml_edit = "0.19" tonic = {version = "0.9", features = ["tls", "tls-roots"]} +tower-service = "0.3.2" 
tracing = "0.1" tracing-error = "0.2.0" -tracing-opentelemetry = "0.19.0" -tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } +tracing-opentelemetry = "0.21.0" +tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] } +twox-hash = { version = "1.6.3", default-features = false } url = "2.2" +urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" -webpki-roots = "0.25" +rustls-native-certs = "0.7" x509-parser = "0.15" ## TODO replace this with tracing @@ -174,7 +199,6 @@ log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } @@ -188,12 +212,14 @@ consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } +pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { 
version = "0.1", path = "./libs/safekeeper_api" } +desim = { version = "0.1", path = "./libs/desim" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy. tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } @@ -206,20 +232,19 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.11" +rcgen = "0.12" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.9" [patch.crates-io] -# This is only needed for proxy's tests. -# TODO: we should probably fork `tokio-postgres-rustls` instead. +# Needed to get `tokio-postgres-rustls` to depend on our fork. tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" } # bug fixes for UUID -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } -parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" } +parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" } ################# Binary contents sections diff --git a/Dockerfile b/Dockerfile index 5d5fde4f14..b4900d4a94 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,12 +47,13 @@ COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ - && mold -run cargo build \ + && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ --bin safekeeper \ --bin storage_broker \ + --bin storage_controller \ --bin proxy \ --bin neon_local \ --locked --release \ @@ -68,8 +69,6 @@ RUN set -e \ && apt install -y \ libreadline-dev \ libseccomp-dev \ - libicu67 \ - openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ @@ -80,6 +79,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin @@ -98,6 +98,11 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \ -c "listen_pg_addr='0.0.0.0:6400'" \ -c "listen_http_addr='0.0.0.0:9898'" +# When running a binary that links with libpq, default to using our most recent postgres version. Binaries +# that want a particular postgres version will select it explicitly: this is just a default. 
+ENV LD_LIBRARY_PATH /usr/local/v16/lib + + VOLUME ["/data"] USER neon EXPOSE 6400 diff --git a/Dockerfile.buildtools b/Dockerfile.build-tools similarity index 70% rename from Dockerfile.buildtools rename to Dockerfile.build-tools index 213aed1679..5dd2c13c0e 100644 --- a/Dockerfile.buildtools +++ b/Dockerfile.build-tools @@ -58,8 +58,14 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$ && mv protoc/include/google /usr/local/include/google \ && rm -rf protoc.zip protoc +# s5cmd +ENV S5CMD_VERSION=2.2.2 +RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \ + && chmod +x s5cmd \ + && mv s5cmd /usr/local/bin/s5cmd + # LLVM -ENV LLVM_VERSION=17 +ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -81,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION v2.4.0 +ENV MOLD_VERSION v2.31.0 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -106,12 +112,51 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz +# Compile and install the static OpenSSL library +ENV OPENSSL_VERSION=3.2.2 +ENV OPENSSL_PREFIX=/usr/local/openssl +RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ + echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ + cd /tmp && \ + tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ + rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ 
+ cd /tmp/openssl-${OPENSSL_VERSION} && \ + ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ + make -j "$(nproc)" && \ + make install && \ + cd /tmp && \ + rm -rf /tmp/openssl-${OPENSSL_VERSION} + +# Use the same version of libicu as the compute nodes so that +# clusters created using initdb on pageserver can be used by computes. +# +# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu +# package, which is 67.1. We're duplicating that knowledge here, and also, technically, +# Debian has a few patches on top of 67.1 that we're not adding here. +ENV ICU_VERSION=67.1 +ENV ICU_PREFIX=/usr/local/icu + +# Download and build static ICU +RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \ + echo "94a80cd6f251a53bd2a997f6f1b5ac6653fe791dfab66e1eb0227740fb86d5dc /tmp/libicu-${ICU_VERSION}.tgz" | sha256sum --check && \ + mkdir /tmp/icu && \ + pushd /tmp/icu && \ + tar -xzf /tmp/libicu-${ICU_VERSION}.tgz && \ + pushd icu/source && \ + ./configure --prefix=${ICU_PREFIX} --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \ + make -j "$(nproc)" && \ + make install && \ + popd && \ + rm -rf icu && \ + rm -f /tmp/libicu-${ICU_VERSION}.tgz && \ + popd + # Switch to nonroot user USER nonroot:nonroot WORKDIR /home/nonroot # Python -ENV PYTHON_VERSION=3.9.2 \ +ENV PYTHON_VERSION=3.9.18 \ PYENV_ROOT=/home/nonroot/.pyenv \ PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH RUN set -e \ @@ -135,7 +180,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.75.0 +ENV RUSTC_VERSION=1.79.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname 
-m)-unknown-linux-gnu/rustup-init && whoami && \ @@ -149,7 +194,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux cargo install --git https://github.com/paritytech/cachepot && \ cargo install rustfilt && \ cargo install cargo-hakari && \ - cargo install cargo-deny && \ + cargo install cargo-deny --locked && \ cargo install cargo-hack && \ cargo install cargo-nextest && \ rm -rf /home/nonroot/.cargo/registry && \ @@ -164,3 +209,6 @@ RUN whoami \ && rustup --version --verbose \ && rustc --version --verbose \ && clang --version + +# Set following flag to check in Makefile if it's running in Docker +RUN touch /home/nonroot/.docker_build diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 14ba1b5b9a..3a73ac71b0 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -52,7 +52,7 @@ RUN cd postgres && \ # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser. # In vanilla postgres this function is limited to Postgres role superuser. # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases. - # We could add the additional grant statements to the postgres repository but it would be hard to maintain, + # We could add the additional grant statements to the postgres repository but it would be hard to maintain, # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork, # so we do it here. 
old_list="pg_stat_statements--1.0--1.1.sql pg_stat_statements--1.1--1.2.sql pg_stat_statements--1.2--1.3.sql pg_stat_statements--1.3--1.4.sql pg_stat_statements--1.4--1.5.sql pg_stat_statements--1.4.sql pg_stat_statements--1.5--1.6.sql"; \ @@ -63,14 +63,14 @@ RUN cd postgres && \ echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \ fi; \ done; \ - # the second loop is for pg_stat_statement extension versions >= 1.7, + # the second loop is for pg_stat_statement extension versions >= 1.7, # where pg_stat_statement_reset() got 3 additional arguments for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \ filename=$(basename "$file"); \ if ! echo "$old_list" | grep -q -F "$filename"; then \ echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \ fi; \ - done + done ######################################################################################### # @@ -89,7 +89,7 @@ RUN apt update && \ # SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \ - mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ + mkdir sfcgal-src && cd sfcgal-src && tar xzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ cmake -DCMAKE_BUILD_TYPE=Release . 
&& make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / @@ -98,7 +98,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ - mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ + mkdir postgis-src && cd postgis-src && tar xzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ @@ -124,7 +124,7 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postg RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ - mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ + mkdir pgrouting-src && cd pgrouting-src && tar xzf ../pgrouting.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -143,29 +143,24 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti ######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + RUN apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PLV8_VERSION=3.1.5 \ - export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \ - ;; \ - "v16") \ - export PLV8_VERSION=3.1.8 \ - export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \ - echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ + echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ + mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ + # generate and copy upgrade scripts + mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ + cp upgrade/* /usr/local/pgsql/share/extension/ && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ + # don't break computes with installed old version of plv8 + cd /usr/local/pgsql/lib/ && \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -199,7 +194,7 @@ RUN case "$(uname -m)" in \ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ echo "ec99f1f5974846bde64f4513cf8d2ea1b8d172d2218ab41803bf6a63532272bc h3.tar.gz" | sha256sum --check && \ - mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \ + mkdir h3-src && cd h3-src && tar xzf ../h3.tar.gz --strip-components=1 -C . && \ mkdir build && cd build && \ cmake .. -DCMAKE_BUILD_TYPE=Release && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -209,7 +204,7 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ - mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ + mkdir h3-pg-src && cd h3-pg-src && tar xzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -227,7 +222,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ echo "411d05beeb97e5a4abf17572bfcfbb5a68d98d1018918feff995f6ee3bb03e79 postgresql-unit.tar.gz" | sha256sum --check && \ - mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. @@ -246,11 +241,17 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ - echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ - mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ - make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ - make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ +COPY patches/pgvector.patch /pgvector.patch + +# By default, pgvector Makefile uses `-march=native`. We don't want that, +# because we build the images on different machines than where we run them. +# Pass OPTFLAGS="" to remove it. 
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ + echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ + mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ + patch -p1 < /pgvector.patch && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control ######################################################################################### @@ -265,7 +266,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ echo "cfdefb15007286f67d3d45510f04a6a7a495004be5b3aecb12cda667e774203f pgjwt.tar.gz" | sha256sum --check && \ - mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \ + mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control @@ -280,7 +281,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ - mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \ + mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control @@ -296,7 +297,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \ echo "74576b992d9277c92196dd8d816baa2cc2d8046fe102f3dcd7f3c3febed6822a pg_hashids.tar.gz" | sha256sum --check && \ - mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ + mkdir pg_hashids-src && cd pg_hashids-src && tar xzf ../pg_hashids.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control @@ -312,7 +313,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ - mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \ + mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control @@ -328,7 +329,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ - mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \ + mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control @@ -344,7 +345,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ - mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \ + mkdir ip4r-src && cd ip4r-src && tar xzf ../ip4r.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/ip4r.control @@ -360,7 +361,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ - mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \ + mkdir prefix-src && cd prefix-src && tar xzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control @@ -376,7 +377,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ - mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \ + mkdir hll-src && cd hll-src && tar xzf ../hll.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control @@ -392,7 +393,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ - mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ + mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control @@ -423,7 +424,7 @@ RUN case "${PG_VERSION}" in \ apt-get install -y cmake && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ - mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ + mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -461,7 +462,7 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL${PG_HINT_PLAN_VERSION}.tar.gz -O pg_hint_plan.tar.gz && \ echo "${PG_HINT_PLAN_CHECKSUM} pg_hint_plan.tar.gz" | sha256sum --check && \ - mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xvzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ + mkdir pg_hint_plan-src && cd pg_hint_plan-src && tar xzf ../pg_hint_plan.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control @@ -480,7 +481,7 @@ RUN apt-get update && \ apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \ wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ - mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ + mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. && \ @@ -504,7 +505,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ - mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ + mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control @@ -525,13 +526,12 @@ RUN apt-get update && \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev \ - libfreetype6-dev + libeigen3-dev ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ - mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \ + mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ -D RDK_BUILD_INCHI_SUPPORT=ON \ @@ -551,6 +551,8 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar. -D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \ -D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \ -D RDK_INSTALL_INTREE=OFF \ + -D RDK_INSTALL_COMIC_FONTS=OFF \ + -D RDK_BUILD_FREETYPE_SUPPORT=OFF \ -D CMAKE_BUILD_TYPE=Release \ . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -569,7 +571,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ - mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ + mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control @@ -586,7 +588,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4.tar.gz -O pg_roaringbitmap.tar.gz && \ echo "b75201efcb1c2d1b014ec4ae6a22769cc7a224e6e406a587f5784a37b6b5a2aa pg_roaringbitmap.tar.gz" | sha256sum --check && \ - mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xvzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ + mkdir pg_roaringbitmap-src && cd pg_roaringbitmap-src && tar xzf ../pg_roaringbitmap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control @@ -603,7 +605,7 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O pg_semver.tar.gz && \ echo "fbdaf7512026d62eec03fad8687c15ed509b6ba395bff140acd63d2e4fbe25d7 pg_semver.tar.gz" | sha256sum --check && \ - mkdir pg_semver-src && cd pg_semver-src && tar xvzf ../pg_semver.tar.gz --strip-components=1 -C . && \ + mkdir pg_semver-src && cd pg_semver-src && tar xzf ../pg_semver.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control @@ -617,6 +619,7 @@ RUN wget https://github.com/theory/pg-semver/archive/refs/tags/v0.32.1.tar.gz -O FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +ARG PG_VERSION ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ "v14" | "v15") \ @@ -628,7 +631,7 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \ echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \ - mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ + mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -642,9 +645,9 @@ FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \ - echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \ - mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \ +RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \ + echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \ + mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C . 
&& \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ @@ -693,7 +696,7 @@ ARG PG_VERSION RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ - mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ + mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -710,7 +713,7 @@ ARG PG_VERSION RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ - mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ + mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language @@ -730,7 +733,7 @@ ARG PG_VERSION # 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ - mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ + mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -746,7 +749,7 @@ ARG PG_VERSION RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \ echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \ - mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \ wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \ @@ -768,10 +771,44 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install +######################################################################################### +# +# Layer "pg_ivm" +# compile pg_ivm extension +# +######################################################################################### +FROM build-deps AS pg-ivm-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ + echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ + mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control + +######################################################################################### +# +# Layer "pg_partman" +# compile pg_partman extension +# +######################################################################################### +FROM build-deps AS pg-partman-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ + echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ + mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -779,6 +816,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. 
# ######################################################################################### FROM build-deps AS neon-pg-ext-build +ARG PG_VERSION + # Public extensions COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=postgis-build /sfcgal/* / @@ -810,6 +849,9 @@ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql +COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -820,6 +862,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/neon_test_utils \ + -s install && \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_rmgr \ @@ -851,7 +897,17 @@ ENV BUILD_TAG=$BUILD_TAG USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +RUN cd compute_tools && mold -run cargo build --locked --profile release-line-debug-size-lto + +######################################################################################### +# +# Final compute-tools image +# +######################################################################################### + +FROM debian:bullseye-slim AS compute-tools-image + +COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl ######################################################################################### # @@ -872,6 +928,69 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a + +######################################################################################### +# +# Layer neon-pg-ext-test +# +######################################################################################### + +FROM neon-pg-ext-build AS neon-pg-ext-test +ARG PG_VERSION +RUN mkdir /ext-src + +#COPY --from=postgis-build /postgis.tar.gz /ext-src/ +#COPY --from=postgis-build /sfcgal/* /usr +COPY --from=plv8-build /plv8.tar.gz /ext-src/ +COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ +COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ +COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ +COPY --from=vector-pg-build /pgvector.patch /ext-src/ +COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src +#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src +#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src +COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src +COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src +#COPY --from=rum-pg-build /rum.tar.gz /ext-src +#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src +COPY --from=ip4r-pg-build /ip4r.tar.gz 
/ext-src +COPY --from=prefix-pg-build /prefix.tar.gz /ext-src +COPY --from=hll-pg-build /hll.tar.gz /ext-src +COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src +#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src +COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src +COPY patches/pg_hintplan.patch /ext-src +#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src +COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src +COPY patches/pg_cron.patch /ext-src +#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src +COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src +COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src +COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src +COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src +#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src +#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src +COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src +COPY patches/pg_anon.patch /ext-src +COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src +COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src +RUN cd /ext-src/ && for f in *.tar.gz; \ + do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ + rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ + || exit 1; rm -f $f; done +RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +# cmake is required for the h3 test +RUN apt-get update && apt-get install -y cmake +RUN patch -p1 < /ext-src/pg_hintplan.patch +COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh +RUN patch -p1 > /etc/ld.so.conf && /sbin/ldconfig && \ # create folder for file cache mkdir -p -m 777 /neon/cache @@ -892,6 +1013,9 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl 
/usr/local/bin/compute_ctl +# Create remote extension download directory +RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions + # Install: # libreadline8 for psql # libicu67, locales for collations (including ICU and plpgsql_check) @@ -900,7 +1024,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS # libxml2, libxslt1.1 for xml2 # libzstd1 for zstd -# libboost*, libfreetype6, and zlib1g for rdkit +# libboost* for rdkit # ca-certificates for communicating with s3 by compute_ctl RUN apt update && \ apt install --no-install-recommends -y \ @@ -913,7 +1037,6 @@ RUN apt update && \ libboost-serialization1.74.0 \ libboost-system1.74.0 \ libossp-uuid16 \ - libfreetype6 \ libgeos-c1v5 \ libgdal28 \ libproj19 \ @@ -925,7 +1048,6 @@ RUN apt update && \ libcurl4-openssl-dev \ locales \ procps \ - zlib1g \ ca-certificates && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/Dockerfile.compute-tools b/Dockerfile.compute-tools deleted file mode 100644 index cc305cc556..0000000000 --- a/Dockerfile.compute-tools +++ /dev/null @@ -1,32 +0,0 @@ -# First transient image to build compute_tools binaries -# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml -ARG REPOSITORY=neondatabase -ARG IMAGE=build-tools -ARG TAG=pinned -ARG BUILD_TAG - -FROM $REPOSITORY/$IMAGE:$TAG AS rust-build -WORKDIR /home/nonroot - -# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds. -# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations. -# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build. 
-ARG RUSTC_WRAPPER=cachepot -ENV AWS_REGION=eu-central-1 -ENV CACHEPOT_S3_KEY_PREFIX=cachepot -ARG CACHEPOT_BUCKET=neon-github-dev -#ARG AWS_ACCESS_KEY_ID -#ARG AWS_SECRET_ACCESS_KEY -ARG BUILD_TAG -ENV BUILD_TAG=$BUILD_TAG - -COPY . . - -RUN set -e \ - && mold -run cargo build -p compute_tools --locked --release \ - && cachepot -s - -# Final image that only has one binary -FROM debian:bullseye-slim - -COPY --from=rust-build /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl diff --git a/Makefile b/Makefile index 004ca3fbcf..942867d81a 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,9 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # Where to install Postgres, default is ./pg_install, maybe useful for package managers POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ +OPENSSL_PREFIX_DIR := /usr/local/openssl +ICU_PREFIX_DIR := /usr/local/icu + # # We differentiate between release / debug build types using the BUILD_TYPE # environment variable. 
@@ -20,19 +23,31 @@ else $(error Bad build type '$(BUILD_TYPE)', see Makefile for options) endif +ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes) + # Exclude static build openssl, icu for local build (MacOS, Linux) + # Only keep for build type release and debug + PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include + PG_CONFIGURE_OPTS += --with-icu + PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION' + PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm' + PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread' +endif + UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Linux) # Seccomp BPF is only available for Linux PG_CONFIGURE_OPTS += --with-libseccomp else ifeq ($(UNAME_S),Darwin) - # macOS with brew-installed openssl requires explicit paths - # It can be configured with OPENSSL_PREFIX variable - OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3) - PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib - PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig - # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure - # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage - EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + ifndef DISABLE_HOMEBREW + # macOS with brew-installed openssl requires explicit paths + # It can be configured with OPENSSL_PREFIX variable + OPENSSL_PREFIX := $(shell brew --prefix openssl@3) + PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib + PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix 
icu4c)/lib/pkgconfig + # macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure + # brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage + EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/: + endif endif # Use -C option so that when PostgreSQL "make install" installs the @@ -51,6 +66,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS)) CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+) # Force cargo not to print progress bar CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1 +# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel) +CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib # # Top level Makefile to build Neon and PostgreSQL @@ -77,11 +94,14 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status: echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \ exit 1; } mkdir -p $(POSTGRES_INSTALL_DIR)/build/$* - (cd $(POSTGRES_INSTALL_DIR)/build/$* && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \ + + VERSION=$*; \ + EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ + (cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \ + env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ CFLAGS='$(PG_CFLAGS)' \ - $(PG_CONFIGURE_OPTS) \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log) + $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) # nicer alias to run 'configure' # Note: I've been unable to use templates for this part of our configuration. 
@@ -117,6 +137,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install +@echo "Compiling amcheck $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install + +@echo "Compiling test_decoding $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install .PHONY: postgres-clean-% postgres-clean-%: @@ -157,8 +179,8 @@ neon-pg-ext-%: postgres-% -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install -.PHONY: neon-pg-ext-clean-% -neon-pg-ext-clean-%: +.PHONY: neon-pg-clean-ext-% +neon-pg-clean-ext-%: $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean @@ -174,10 +196,10 @@ neon-pg-ext-clean-%: # Build walproposer as a static library. walproposer source code is located # in the pgxn/neon directory. -# +# # We also need to include libpgport.a and libpgcommon.a, because walproposer # uses some functions from those libraries. -# +# # Some object files are removed from libpgport.a and libpgcommon.a because # they depend on openssl and other libraries that are not included in our # Rust build. @@ -214,11 +236,11 @@ neon-pg-ext: \ neon-pg-ext-v15 \ neon-pg-ext-v16 -.PHONY: neon-pg-ext-clean -neon-pg-ext-clean: \ - neon-pg-ext-clean-v14 \ - neon-pg-ext-clean-v15 \ - neon-pg-ext-clean-v16 +.PHONY: neon-pg-clean-ext +neon-pg-clean-ext: \ + neon-pg-clean-ext-v14 \ + neon-pg-clean-ext-v15 \ + neon-pg-clean-ext-v16 # shorthand to build all Postgres versions .PHONY: postgres @@ -247,7 +269,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. .PHONY: clean -clean: postgres-clean neon-pg-ext-clean +clean: postgres-clean neon-pg-clean-ext $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/NOTICE b/NOTICE index c13dc2f0b3..52fc751c41 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ Neon -Copyright 2022 Neon Inc. 
+Copyright 2022 - 2024 Neon Inc. The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license. See vendor/postgres-vX/COPYRIGHT for details. diff --git a/README.md b/README.md index 98af1edee6..ea0a289502 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ -[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech) +[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech) + + # Neon Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes. ## Quick start -Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. +Try the [Neon Free Tier](https://neon.tech/github) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions. Alternatively, compile and run the project [locally](#running-local-installation). @@ -14,8 +16,8 @@ Alternatively, compile and run the project [locally](#running-local-installation A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine. The Neon storage engine consists of two major components: -- Pageserver. 
Scalable storage backend for the compute nodes. -- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. +- Pageserver: Scalable storage backend for the compute nodes. +- Safekeepers: The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage. See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information. @@ -81,9 +83,9 @@ The project uses [rust toolchain file](./rust-toolchain.toml) to define the vers This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file. -rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. +rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory. -non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file. +non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify that their toolchain matches the version in the file. Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates. 
#### Building on Linux @@ -124,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively. To run the integration tests or Python scripts (not required to use the code), install -Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory. +Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory. #### Running neon database @@ -166,7 +168,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres' 2. Now, it is possible to connect to postgres and run some queries: ```text -> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE postgres=# insert into t values(1,1); @@ -205,7 +207,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres' # this new postgres instance will have all the data from 'main' postgres, # but all modifications would not affect data in original postgres -> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -216,7 +218,7 @@ postgres=# insert into t values(2,2); INSERT 0 1 # check that the new change doesn't affect the 'main' postgres -> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres +> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres postgres=# select * from t; key | value -----+------- @@ -224,14 +226,28 @@ postgres=# select * from t; (1 row) ``` -4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances +4. 
If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances you have just started. You can terminate them all with one command: ```sh > cargo neon stop ``` +More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md). + +#### Handling build failures + +If you encounter errors while setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again. + ## Running tests +### Rust unit tests + +We are using [`cargo-nextest`](https://nexte.st/) to run the tests in GitHub workflows. +Some crates do not support running plain `cargo test` anymore; prefer `cargo nextest run` instead. +You can install `cargo-nextest` with `cargo install cargo-nextest`. + +### Integration tests + Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes). ```sh @@ -243,12 +259,28 @@ CARGO_BUILD_FLAGS="--features=testing" make ``` By default, this runs both debug and release modes, and all supported postgres versions. When -testing locally, it is convenient to run just run one set of permutations, like this: +testing locally, it is convenient to run just one set of permutations, like this: ```sh DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest ``` +## Flamegraphs + +You may find yourself in need of flamegraphs for software in this repository. +You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice! + +> [!IMPORTANT] +> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument. +> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository. +> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764). 
+ +## Cleanup + +For cleaning up the source tree from build artifacts, run `make clean` in the source directory. + +For removing every artifact from build and configure steps, run `make distclean`, and also consider removing the cargo binaries in the `target` directory, as well as the database in the `.neon` directory. Note that removing the `.neon` directory will remove your database, with all data in it. You have been warned! + ## Documentation [docs](/docs) Contains a top-level overview of all available markdown documentation. diff --git a/clippy.toml b/clippy.toml index d788afc84d..4c0c04f9a1 100644 --- a/clippy.toml +++ b/clippy.toml @@ -2,4 +2,13 @@ disallowed-methods = [ "tokio::task::block_in_place", # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", + # use tokio_epoll_uring_ext instead + "tokio_epoll_uring::thread_local_system", +] + +disallowed-macros = [ + # use std::pin::pin + "futures::pin_mut", + # cannot disallow this, because clippy finds used from tokio macros + #"tokio::pin", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 759a117ee9..8f96530a9d 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,10 +27,12 @@ reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true +tokio-stream.workspace = true tracing.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true +thiserror.workspace = true url.workspace = true compute_api.workspace = true diff --git a/compute_tools/README.md b/compute_tools/README.md index 22a7de7cb7..8d84031efc 100644 --- a/compute_tools/README.md +++ b/compute_tools/README.md @@ -32,6 +32,29 @@ compute_ctl -D /var/db/postgres/compute \ -b /usr/local/bin/postgres ``` +## State Diagram + +Computes can be in various states. 
Below is a diagram that details how a +compute moves between states. + +```mermaid +%% https://mermaid.js.org/syntax/stateDiagram.html +stateDiagram-v2 + [*] --> Empty : Compute spawned + Empty --> ConfigurationPending : Waiting for compute spec + ConfigurationPending --> Configuration : Received compute spec + Configuration --> Failed : Failed to configure the compute + Configuration --> Running : Compute has been configured + Empty --> Init : Compute spec is immediately available + Empty --> TerminationPending : Requested termination + Init --> Failed : Failed to start Postgres + Init --> Running : Started Postgres + Running --> TerminationPending : Requested termination + TerminationPending --> Terminated : Terminated compute + Failed --> [*] : Compute exited + Terminated --> [*] : Compute exited +``` + ## Tests Cargo formatter: diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index eb1d746f04..7bf5db5a57 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -32,8 +32,6 @@ //! -S /var/db/postgres/specs/current.json \ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ -//! --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable' -//! --pgbouncer-ini-path /etc/pgbouncer.ini \ //! ``` //! 
use std::collections::HashMap; @@ -47,15 +45,17 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; -use nix::sys::signal::{kill, Signal}; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; -use tracing::{error, info}; +use tracing::{error, info, warn}; use url::Url; use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeSpec; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; +use compute_tools::compute::{ + forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, +}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -63,12 +63,41 @@ use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; +use compute_tools::swap::resize_swap; // this is an arbitrary build tag. Fine as a default / for testing purposes // in-case of not-set environment var const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { + let (build_tag, clap_args) = init()?; + + let (pg_handle, start_pg_result) = { + // Enter startup tracing context + let _startup_context_guard = startup_context_from_env(); + + let cli_args = process_cli(&clap_args)?; + + let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?; + + let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?; + + start_postgres(&clap_args, wait_spec_result)? + + // Startup is finished, exit the startup tracing span + }; + + // PostgreSQL is now running, if startup was successful. Wait until it exits. 
+ let wait_pg_result = wait_postgres(pg_handle)?; + + let delay_exit = cleanup_after_postgres_exit(start_pg_result)?; + + maybe_delay_exit(delay_exit); + + deinit_and_exit(wait_pg_result); +} + +fn init() -> Result<(String, clap::ArgMatches)> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; @@ -83,9 +112,15 @@ fn main() -> Result<()> { .to_string(); info!("build_tag: {build_tag}"); - let matches = cli().get_matches(); - let pgbin_default = String::from("postgres"); - let pgbin = matches.get_one::("pgbin").unwrap_or(&pgbin_default); + Ok((build_tag, cli().get_matches())) +} + +fn process_cli(matches: &clap::ArgMatches) -> Result { + let pgbin_default = "postgres"; + let pgbin = matches + .get_one::("pgbin") + .map(|s| s.as_str()) + .unwrap_or(pgbin_default); let ext_remote_storage = matches .get_one::("remote-ext-config") @@ -111,10 +146,32 @@ fn main() -> Result<()> { .expect("Postgres connection string is required"); let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); + let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind"); - let pgbouncer_connstr = matches.get_one::("pgbouncer-connstr"); - let pgbouncer_ini_path = matches.get_one::("pgbouncer-ini-path"); + Ok(ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + http_port, + spec_json, + spec_path, + resize_swap_on_bind, + }) +} +struct ProcessCliResult<'clap> { + connstr: &'clap str, + pgdata: &'clap str, + pgbin: &'clap str, + ext_remote_storage: Option<&'clap str>, + http_port: u16, + spec_json: Option<&'clap String>, + spec_path: Option<&'clap String>, + resize_swap_on_bind: bool, +} + +fn startup_context_from_env() -> Option { // Extract OpenTelemetry context for the startup actions from the // TRACEPARENT and TRACESTATE env variables, and attach it to the current // tracing context. 
@@ -151,7 +208,7 @@ fn main() -> Result<()> { if let Ok(val) = std::env::var("TRACESTATE") { startup_tracing_carrier.insert("tracestate".to_string(), val); } - let startup_context_guard = if !startup_tracing_carrier.is_empty() { + if !startup_tracing_carrier.is_empty() { use opentelemetry::propagation::TextMapPropagator; use opentelemetry::sdk::propagation::TraceContextPropagator; let guard = TraceContextPropagator::new() @@ -161,8 +218,17 @@ fn main() -> Result<()> { Some(guard) } else { None - }; + } +} +fn try_spec_from_cli( + matches: &clap::ArgMatches, + ProcessCliResult { + spec_json, + spec_path, + .. + }: &ProcessCliResult, +) -> Result { let compute_id = matches.get_one::("compute-id"); let control_plane_uri = matches.get_one::("control-plane-uri"); @@ -203,6 +269,34 @@ fn main() -> Result<()> { } }; + Ok(CliSpecParams { + spec, + live_config_allowed, + }) +} + +struct CliSpecParams { + /// If a spec was provided via CLI or file, the [`ComputeSpec`] + spec: Option, + live_config_allowed: bool, +} + +fn wait_spec( + build_tag: String, + ProcessCliResult { + connstr, + pgdata, + pgbin, + ext_remote_storage, + resize_swap_on_bind, + http_port, + .. + }: ProcessCliResult, + CliSpecParams { + spec, + live_config_allowed, + }: CliSpecParams, +) -> Result { let mut new_state = ComputeState::new(); let spec_set; @@ -225,26 +319,22 @@ fn main() -> Result<()> { ext_remote_storage: ext_remote_storage.map(|s| s.to_string()), ext_download_progress: RwLock::new(HashMap::new()), build_tag, - pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()), - pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()), }; let compute = Arc::new(compute_node); // If this is a pooled VM, prewarm before starting HTTP server and becoming - // available for binding. Prewarming helps postgres start quicker later, - // because QEMU will already have it's memory allocated from the host, and - // the necessary binaries will alreaady be cached. + // available for binding. 
Prewarming helps Postgres start quicker later, + // because QEMU will already have its memory allocated from the host, and + // the necessary binaries will already be cached. if !spec_set { compute.prewarm_postgres()?; } - // Launch http service first, so we were able to serve control-plane - // requests, while configuration is still in progress. + // Launch http service first, so that we can serve control-plane requests + // while configuration is still in progress. let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); - let extension_server_port: u16 = http_port; - if !spec_set { // No spec provided, hang waiting for it. info!("no compute spec provided, waiting"); @@ -259,50 +349,118 @@ fn main() -> Result<()> { break; } } + + // Record for how long we slept waiting for the spec. + let now = Utc::now(); + state.metrics.wait_for_spec_ms = now + .signed_duration_since(state.start_time) + .to_std() + .unwrap() + .as_millis() as u64; + + // Reset start time, so that the total startup time that is calculated later will + // not include the time that we waited for the spec. + state.start_time = now; } + Ok(WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }) +} + +struct WaitSpecResult { + compute: Arc, + // passed through from ProcessCliResult + http_port: u16, + resize_swap_on_bind: bool, +} + +fn start_postgres( + // need to allow unused because `matches` is only used if target_os = "linux" + #[allow(unused_variables)] matches: &clap::ArgMatches, + WaitSpecResult { + compute, + http_port, + resize_swap_on_bind, + }: WaitSpecResult, +) -> Result<(Option, StartPostgresResult)> { // We got all we need, update the state. let mut state = compute.state.lock().unwrap(); - - // Record for how long we slept waiting for the spec. 
- state.metrics.wait_for_spec_ms = Utc::now() - .signed_duration_since(state.start_time) - .to_std() - .unwrap() - .as_millis() as u64; - // Reset start time to the actual start of the configuration, so that - // total startup time was properly measured at the end. - state.start_time = Utc::now(); - state.status = ComputeStatus::Init; compute.state_changed.notify_all(); + + info!( + "running compute with features: {:?}", + state.pspec.as_ref().unwrap().spec.features + ); + // before we release the mutex, fetch the swap size (if any) for later. + let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes; drop(state); // Launch remaining service threads let _monitor_handle = launch_monitor(&compute); let _configurator_handle = launch_configurator(&compute); - // Start Postgres + let mut prestartup_failed = false; let mut delay_exit = false; - let mut exit_code = None; - let pg = match compute.start_compute(extension_server_port) { - Ok(pg) => Some(pg), - Err(err) => { - error!("could not start the compute node: {:?}", err); - let mut state = compute.state.lock().unwrap(); - state.error = Some(format!("{:?}", err)); - state.status = ComputeStatus::Failed; - // Notify others that Postgres failed to start. In case of configuring the - // empty compute, it's likely that API handler is still waiting for compute - // state change. With this we will notify it that compute is in Failed state, - // so control plane will know about it earlier and record proper error instead - // of timeout. - compute.state_changed.notify_all(); - drop(state); // unlock - delay_exit = true; - None + + // Resize swap to the desired size if the compute spec says so + if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) { + // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion + // *before* starting postgres. 
+ // + // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this + // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets + // OOM-killed during startup because swap wasn't available yet. + match resize_swap(size_bytes) { + Ok(()) => { + let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display. + info!(%size_bytes, %size_gib, "resized swap"); + } + Err(err) => { + let err = err.context("failed to resize swap"); + error!("{err:#}"); + + // Mark compute startup as failed; don't try to start postgres, and report this + // error to the control plane when it next asks. + prestartup_failed = true; + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{err:?}")); + state.status = ComputeStatus::Failed; + compute.state_changed.notify_all(); + delay_exit = true; + } } - }; + } + + let extension_server_port: u16 = http_port; + + // Start Postgres + let mut pg = None; + if !prestartup_failed { + pg = match compute.start_compute(extension_server_port) { + Ok(pg) => Some(pg), + Err(err) => { + error!("could not start the compute node: {:#}", err); + let mut state = compute.state.lock().unwrap(); + state.error = Some(format!("{:?}", err)); + state.status = ComputeStatus::Failed; + // Notify others that Postgres failed to start. In case of configuring the + // empty compute, it's likely that API handler is still waiting for compute + // state change. With this we will notify it that compute is in Failed state, + // so control plane will know about it earlier and record proper error instead + // of timeout. + compute.state_changed.notify_all(); + drop(state); // unlock + delay_exit = true; + None + } + }; + } else { + warn!("skipping postgres startup because pre-startup step failed"); + } // Start the vm-monitor if directed to. The vm-monitor only runs on linux // because it requires cgroups. 
@@ -335,7 +493,7 @@ fn main() -> Result<()> { // This token is used internally by the monitor to clean up all threads let token = CancellationToken::new(); - let vm_monitor = &rt.as_ref().map(|rt| { + let vm_monitor = rt.as_ref().map(|rt| { rt.spawn(vm_monitor::start( Box::leak(Box::new(vm_monitor::Args { cgroup: cgroup.cloned(), @@ -348,20 +506,74 @@ fn main() -> Result<()> { } } + Ok(( + pg, + StartPostgresResult { + delay_exit, + compute, + #[cfg(target_os = "linux")] + rt, + #[cfg(target_os = "linux")] + token, + #[cfg(target_os = "linux")] + vm_monitor, + }, + )) +} + +type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>); + +struct StartPostgresResult { + delay_exit: bool, + // passed through from WaitSpecResult + compute: Arc, + + #[cfg(target_os = "linux")] + rt: Option, + #[cfg(target_os = "linux")] + token: tokio_util::sync::CancellationToken, + #[cfg(target_os = "linux")] + vm_monitor: Option>>, +} + +fn wait_postgres(pg: Option) -> Result { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. - if let Some(mut pg) = pg { - // Startup is finished, exit the startup tracing span - drop(startup_context_guard); - + let mut exit_code = None; + if let Some((mut pg, logs_handle)) = pg { let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); PG_PID.store(0, Ordering::SeqCst); + + // Process has exited, so we can join the logs thread. 
+ let _ = logs_handle + .join() + .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + info!("Postgres exited with code {}, shutting down", ecode); exit_code = ecode.code() } + Ok(WaitPostgresResult { exit_code }) +} + +struct WaitPostgresResult { + exit_code: Option, +} + +fn cleanup_after_postgres_exit( + StartPostgresResult { + mut delay_exit, + compute, + #[cfg(target_os = "linux")] + vm_monitor, + #[cfg(target_os = "linux")] + token, + #[cfg(target_os = "linux")] + rt, + }: StartPostgresResult, +) -> Result { // Terminate the vm_monitor so it releases the file watcher on // /sys/fs/cgroup/neon-postgres. // Note: the vm-monitor only runs on linux because it requires cgroups. @@ -390,17 +602,32 @@ fn main() -> Result<()> { info!("synced safekeepers at lsn {lsn}"); } + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::TerminationPending { + state.status = ComputeStatus::Terminated; + compute.state_changed.notify_all(); + // we were asked to terminate gracefully, don't exit to avoid restart + delay_exit = true + } + drop(state); + if let Err(err) = compute.check_for_core_dumps() { error!("error while checking for core dumps: {err:?}"); } + Ok(delay_exit) +} + +fn maybe_delay_exit(delay_exit: bool) { // If launch failed, keep serving HTTP requests for a while, so the cloud // control plane can get the actual error. if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); thread::sleep(Duration::from_secs(30)); } +} +fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. 
Shutting down OTEL tracing provider may // hang for quite some time, see, for example: @@ -508,26 +735,14 @@ fn cli() -> clap::Command { Arg::new("filecache-connstr") .long("filecache-connstr") .default_value( - "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable", + "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor", ) .value_name("FILECACHE_CONNSTR"), ) .arg( - Arg::new("pgbouncer-connstr") - .long("pgbouncer-connstr") - .default_value( - "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable", - ) - .value_name("PGBOUNCER_CONNSTR"), - ) - .arg( - Arg::new("pgbouncer-ini-path") - .long("pgbouncer-ini-path") - // Note: this doesn't match current path for pgbouncer.ini. - // Until we fix it, we need to pass the path explicitly - // or this will be effectively no-op. - .default_value("/etc/pgbouncer.ini") - .value_name("PGBOUNCER_INI_PATH"), + Arg::new("resize-swap-on-bind") + .long("resize-swap-on-bind") + .action(clap::ArgAction::SetTrue), ) } @@ -536,16 +751,7 @@ fn cli() -> clap::Command { /// wait for termination which would be easy then. 
fn handle_exit_signal(sig: i32) { info!("received {sig} termination signal"); - let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); - if ss_pid != 0 { - let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); - kill(ss_pid, Signal::SIGTERM).ok(); - } - let pg_pid = PG_PID.load(Ordering::SeqCst); - if pg_pid != 0 { - let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - kill(pg_pid, Signal::SIGTERM).ok(); - } + forward_termination_signal(); exit(1); } diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs new file mode 100644 index 0000000000..4fefa831e0 --- /dev/null +++ b/compute_tools/src/catalog.rs @@ -0,0 +1,116 @@ +use compute_api::{ + responses::CatalogObjects, + spec::{Database, Role}, +}; +use futures::Stream; +use postgres::{Client, NoTls}; +use std::{path::Path, process::Stdio, result::Result, sync::Arc}; +use tokio::{ + io::{AsyncBufReadExt, BufReader}, + process::Command, + task, +}; +use tokio_stream::{self as stream, StreamExt}; +use tokio_util::codec::{BytesCodec, FramedRead}; +use tracing::warn; + +use crate::{ + compute::ComputeNode, + pg_helpers::{get_existing_dbs, get_existing_roles}, +}; + +pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { + let connstr = compute.connstr.clone(); + task::spawn_blocking(move || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + let roles: Vec; + { + let mut xact = client.transaction()?; + roles = get_existing_roles(&mut xact)?; + } + let databases: Vec = get_existing_dbs(&mut client)?.values().cloned().collect(); + + Ok(CatalogObjects { roles, databases }) + }) + .await? +} + +#[derive(Debug, thiserror::Error)] +pub enum SchemaDumpError { + #[error("Database does not exist.")] + DatabaseDoesNotExist, + #[error("Failed to execute pg_dump.")] + IO(#[from] std::io::Error), +} + +// It uses the pg_dump utility to dump the schema of the specified database. +// The output is streamed back to the caller and supposed to be streamed via HTTP. 
+// +// Before returning the result with the output, it checks that pg_dump produced any output. +// If not, it tries to parse the stderr output to determine whether the database does not exist, +// in which case a special error is returned. +// +// To make sure that the process is killed when the caller drops the stream, we use tokio's kill_on_drop feature. +pub async fn get_database_schema( + compute: &Arc, + dbname: &str, +) -> Result>, SchemaDumpError> { + let pgbin = &compute.pgbin; + let basepath = Path::new(pgbin).parent().unwrap(); + let pgdump = basepath.join("pg_dump"); + let mut connstr = compute.connstr.clone(); + connstr.set_path(dbname); + let mut cmd = Command::new(pgdump) + .arg("--schema-only") + .arg(connstr.as_str()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .kill_on_drop(true) + .spawn()?; + + let stdout = cmd.stdout.take().ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.") + })?; + + let stderr = cmd.stderr.take().ok_or_else(|| { + std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.") + })?; + + let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new()); + let stderr_reader = BufReader::new(stderr); + + let first_chunk = match stdout_reader.next().await { + Some(Ok(bytes)) if !bytes.is_empty() => bytes, + Some(Err(e)) => { + return Err(SchemaDumpError::IO(e)); + } + _ => { + let mut lines = stderr_reader.lines(); + if let Some(line) = lines.next_line().await? 
{ + if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) { + return Err(SchemaDumpError::DatabaseDoesNotExist); + } + warn!("pg_dump stderr: {}", line) + } + tokio::spawn(async move { + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + + return Err(SchemaDumpError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + "failed to start pg_dump", + ))); + } + }; + let initial_stream = stream::once(Ok(first_chunk.freeze())); + // Consume stderr and log warnings + tokio::spawn(async move { + let mut lines = stderr_reader.lines(); + while let Ok(Some(line)) = lines.next_line().await { + warn!("pg_dump stderr: {}", line) + } + }); + Ok(initial_stream.chain(stdout_reader.map(|res| res.map(|b| b.freeze())))) +} diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 13701b7378..a79b666409 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::env; use std::fs; use std::io::BufRead; -use std::os::unix::fs::PermissionsExt; +use std::os::unix::fs::{symlink, PermissionsExt}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; @@ -17,10 +17,10 @@ use chrono::{DateTime, Utc}; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; +use nix::unistd::Pid; +use postgres::error::SqlState; use postgres::{Client, NoTls}; -use tokio; -use tokio_postgres; -use tracing::{error, info, instrument, warn}; +use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -28,9 +28,12 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; +use nix::sys::signal::{kill, Signal}; + use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; +use 
crate::logger::inlinify; use crate::pg_helpers::*; use crate::spec::*; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -70,10 +73,6 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, pub build_tag: String, - // connection string to pgbouncer to change settings - pub pgbouncer_connstr: Option, - // path to pgbouncer.ini to change settings - pub pgbouncer_ini_path: Option, } // store some metrics about download size that might impact startup time @@ -210,6 +209,7 @@ fn maybe_cgexec(cmd: &str) -> Command { /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser /// that we give to customers +#[instrument(skip_all)] fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let roles = spec .cluster @@ -279,7 +279,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> $$;"#, roles_decl, database_decl, ); - info!("Neon superuser created:\n{}", &query); + info!("Neon superuser created: {}", inlinify(&query)); client .simple_query(&query) .map_err(|e| anyhow::anyhow!(e).context(query))?; @@ -322,11 +322,12 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. #[instrument(skip_all, fields(%lsn))] - fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); let start_time = Instant::now(); - let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?; + let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); + let mut config = postgres::Config::from_str(shard0_connstr)?; // Use the storage auth token from the config file, if given. 
// Note: this overrides any password set in the connection string. @@ -393,6 +394,34 @@ impl ComputeNode { Ok(()) } + // Gets the basebackup in a retry loop + #[instrument(skip_all, fields(%lsn))] + pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { + let mut retry_period_ms = 500.0; + let mut attempts = 0; + let max_attempts = 10; + loop { + let result = self.try_get_basebackup(compute_state, lsn); + match result { + Ok(_) => { + return result; + } + Err(ref e) if attempts < max_attempts => { + warn!( + "Failed to get basebackup: {} (attempt {}/{})", + e, attempts, max_attempts + ); + std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64)); + retry_period_ms *= 1.5; + } + Err(_) => { + return result; + } + } + attempts += 1; + } + } + pub async fn check_safekeepers_synced_async( &self, compute_state: &ComputeState, @@ -495,7 +524,7 @@ impl ComputeNode { pub fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); - let sync_handle = maybe_cgexec(&self.pgbin) + let mut sync_handle = maybe_cgexec(&self.pgbin) .args(["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode .envs(if let Some(storage_auth_token) = &storage_auth_token { @@ -504,18 +533,30 @@ impl ComputeNode { vec![] }) .stdout(Stdio::piped()) + .stderr(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst); // `postgres --sync-safekeepers` will print all log output to stderr and - // final LSN to stdout. So we pipe only stdout, while stderr will be automatically - // redirected to the caller output. + // final LSN to stdout. So we leave stdout to collect LSN, while stderr logs + // will be collected in a child thread. 
+ let stderr = sync_handle + .stderr + .take() + .expect("stderr should be captured"); + let logs_handle = handle_postgres_logs(stderr); + let sync_output = sync_handle .wait_with_output() .expect("postgres --sync-safekeepers failed"); SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); + // Process has exited, so we can join the logs thread. + let _ = logs_handle + .join() + .map_err(|e| tracing::error!("log thread panicked: {:?}", e)); + if !sync_output.status.success() { anyhow::bail!( "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}", @@ -596,6 +637,48 @@ impl ComputeNode { // Update pg_hba.conf received with basebackup. update_pg_hba(pgdata_path)?; + // Place pg_dynshmem under /dev/shm. This allows us to use + // 'dynamic_shared_memory_type = mmap' so that the files are placed in + // /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works. + // + // Why on earth don't we just stick to the 'posix' default, you might + // ask. It turns out that making large allocations with 'posix' doesn't + // work very well with autoscaling. The behavior we want is that: + // + // 1. You can make large DSM allocations, larger than the current RAM + // size of the VM, without errors + // + // 2. If the allocated memory is really used, the VM is scaled up + // automatically to accommodate that + // + // We try to make that possible by having swap in the VM. But with the + // default 'posix' DSM implementation, we fail step 1, even when there's + // plenty of swap available. PostgreSQL uses posix_fallocate() to create + // the shmem segment, which is really just a file in /dev/shm in Linux, + // but posix_fallocate() on tmpfs returns ENOMEM if the size is larger + // than available RAM. + // + // Using 'dynamic_shared_memory_type = mmap' works around that, because + // the Postgres 'mmap' DSM implementation doesn't use + // posix_fallocate(). Instead, it uses repeated calls to write(2) to + // fill the file with zeros. 
It's weird that that differs between + // 'posix' and 'mmap', but we take advantage of it. When the file is + // filled slowly with write(2), the kernel allows it to grow larger, as + // long as there's swap available. + // + // In short, using 'dynamic_shared_memory_type = mmap' allows a single DSM + // segment to be larger than currently available RAM. But we don't want + // to store it in a real file, which the kernel would try to flush to + // disk, so we symlink pg_dynshmem to /dev/shm. + // + // We don't set 'dynamic_shared_memory_type = mmap' here, we let the + // control plane control that option. If 'mmap' is not used, this + // symlink doesn't affect anything. + // + // See https://github.com/neondatabase/autoscaling/issues/800 + std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?; + symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?; + match spec.mode { ComputeMode::Primary => {} ComputeMode::Replica | ComputeMode::Static(..) => { @@ -640,8 +723,12 @@ impl ComputeNode { // Stop it when it's ready info!("waiting for postgres"); wait_for_postgres(&mut pg, Path::new(pgdata))?; - pg.kill()?; - info!("sent kill signal"); + // SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL + // it to avoid orphaned processes prowling around while datadir is + // wiped. + let pm_pid = Pid::from_raw(pg.id() as i32); + kill(pm_pid, Signal::SIGQUIT)?; + info!("sent SIGQUIT signal"); pg.wait()?; info!("done prewarming"); @@ -652,11 +739,12 @@ impl ComputeNode { /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. + /// Returns a handle to the child process and a handle to the logs thread. #[instrument(skip_all)] pub fn start_postgres( &self, storage_auth_token: Option, - ) -> Result { + ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. 
@@ -667,13 +755,38 @@ impl ComputeNode { } else { vec![] }) + .stderr(Stdio::piped()) .spawn() .expect("cannot start postgres process"); PG_PID.store(pg.id(), Ordering::SeqCst); + // Start a thread to collect logs from stderr. + let stderr = pg.stderr.take().expect("stderr should be captured"); + let logs_handle = handle_postgres_logs(stderr); + wait_for_postgres(&mut pg, pgdata_path)?; - Ok(pg) + Ok((pg, logs_handle)) + } + + /// Do post configuration of the already started Postgres. This function spawns a background thread to + /// configure the database after applying the compute spec. Currently, it upgrades the neon extension + /// version. In the future, it may upgrade all 3rd-party extensions. + #[instrument(skip_all)] + pub fn post_apply_config(&self) -> Result<()> { + let connstr = self.connstr.clone(); + thread::spawn(move || { + let func = || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_neon_extension_upgrade(&mut client) + .context("handle_neon_extension_upgrade")?; + Ok::<_, anyhow::Error>(()) + }; + if let Err(err) = func() { + error!("error while post_apply_config: {err:#}"); + } + }); + Ok(()) } /// Do initial configuration of the already started Postgres. @@ -685,49 +798,78 @@ impl ComputeNode { // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. 
- let mut client = match Client::connect(self.connstr.as_str(), NoTls) { - Err(e) => { - info!( - "cannot connect to postgres: {}, retrying with `zenith_admin` username", - e - ); - let mut zenith_admin_connstr = self.connstr.clone(); + let connstr = self.connstr.clone(); + let mut client = match Client::connect(connstr.as_str(), NoTls) { + Err(e) => match e.code() { + Some(&SqlState::INVALID_PASSWORD) + | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { + // connect with zenith_admin if cloud_admin could not authenticate + info!( + "cannot connect to postgres: {}, retrying with `zenith_admin` username", + e + ); + let mut zenith_admin_connstr = connstr.clone(); - zenith_admin_connstr - .set_username("zenith_admin") - .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + zenith_admin_connstr + .set_username("zenith_admin") + .map_err(|_| anyhow::anyhow!("invalid connstr"))?; - let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?; - // Disable forwarding so that users don't get a cloud_admin role - client.simple_query("SET neon.forward_ddl = false")?; - client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; - client.simple_query("GRANT zenith_admin TO cloud_admin")?; - drop(client); + let mut client = + Client::connect(zenith_admin_connstr.as_str(), NoTls) + .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; + // Disable forwarding so that users don't get a cloud_admin role - // reconnect with connsting with expected name - Client::connect(self.connstr.as_str(), NoTls)? 
- } + let mut func = || { + client.simple_query("SET neon.forward_ddl = false")?; + client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?; + client.simple_query("GRANT zenith_admin TO cloud_admin")?; + Ok::<_, anyhow::Error>(()) + }; + func().context("apply_config setup cloud_admin")?; + + drop(client); + + // reconnect with connstring with expected name + Client::connect(connstr.as_str(), NoTls)? + } + _ => return Err(e.into()), + }, Ok(client) => client, }; // Disable DDL forwarding because control plane already knows about these roles/databases. - client.simple_query("SET neon.forward_ddl = false")?; + client + .simple_query("SET neon.forward_ddl = false") + .context("apply_config SET neon.forward_ddl = false")?; // Proceed with post-startup configuration. Note, that order of operations is important. let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; - create_neon_superuser(spec, &mut client)?; - cleanup_instance(&mut client)?; - handle_roles(spec, &mut client)?; - handle_databases(spec, &mut client)?; - handle_role_deletions(spec, self.connstr.as_str(), &mut client)?; - handle_grants(spec, &mut client, self.connstr.as_str())?; - handle_extensions(spec, &mut client)?; - handle_extension_neon(&mut client)?; - create_availability_check_data(&mut client)?; + create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?; + cleanup_instance(&mut client).context("apply_config cleanup_instance")?; + handle_roles(spec, &mut client).context("apply_config handle_roles")?; + handle_databases(spec, &mut client).context("apply_config handle_databases")?; + handle_role_deletions(spec, connstr.as_str(), &mut client) + .context("apply_config handle_role_deletions")?; + handle_grants( + spec, + &mut client, + connstr.as_str(), + self.has_feature(ComputeFeature::AnonExtension), + ) + .context("apply_config handle_grants")?; + handle_extensions(spec, &mut client).context("apply_config handle_extensions")?; + 
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?; + create_availability_check_data(&mut client) + .context("apply_config create_availability_check_data")?; // 'Close' connection drop(client); + // Run migrations separately to not hold up cold starts + thread::spawn(move || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + handle_migrations(&mut client).context("apply_config handle_migrations") + }); Ok(()) } @@ -750,8 +892,8 @@ impl ComputeNode { pub fn reconfigure(&self) -> Result<()> { let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec; - if let Some(connstr) = &self.pgbouncer_connstr { - info!("tuning pgbouncer with connstr: {:?}", connstr); + if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { + info!("tuning pgbouncer"); let rt = tokio::runtime::Builder::new_current_thread() .enable_all() @@ -760,15 +902,9 @@ impl ComputeNode { // Spawn a thread to do the tuning, // so that we don't block the main thread that starts Postgres. - let pgbouncer_settings = spec.pgbouncer_settings.clone(); - let connstr_clone = connstr.clone(); - let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let pgbouncer_settings = pgbouncer_settings.clone(); let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer( - pgbouncer_settings, - &connstr_clone, - pgbouncer_ini_path, - )); + let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -782,29 +918,39 @@ impl ComputeNode { // temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: // creating new extensions, roles, etc... 
- config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + self.pg_reload_conf()?; - let mut client = Client::connect(self.connstr.as_str(), NoTls)?; + let mut client = Client::connect(self.connstr.as_str(), NoTls)?; - // Proceed with post-startup configuration. Note, that order of operations is important. - // Disable DDL forwarding because control plane already knows about these roles/databases. - if spec.mode == ComputeMode::Primary { - client.simple_query("SET neon.forward_ddl = false")?; - cleanup_instance(&mut client)?; - handle_roles(&spec, &mut client)?; - handle_databases(&spec, &mut client)?; - handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants(&spec, &mut client, self.connstr.as_str())?; - handle_extensions(&spec, &mut client)?; - handle_extension_neon(&mut client)?; - } + // Proceed with post-startup configuration. Note, that order of operations is important. + // Disable DDL forwarding because control plane already knows about these roles/databases. + if spec.mode == ComputeMode::Primary { + client.simple_query("SET neon.forward_ddl = false")?; + cleanup_instance(&mut client)?; + handle_roles(&spec, &mut client)?; + handle_databases(&spec, &mut client)?; + handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; + handle_grants( + &spec, + &mut client, + self.connstr.as_str(), + self.has_feature(ComputeFeature::AnonExtension), + )?; + handle_extensions(&spec, &mut client)?; + handle_extension_neon(&mut client)?; + // We can skip handle_migrations here because a new migration can only appear + // if we have a new version of the compute_ctl binary, which can only happen + // if compute got restarted, in which case we'll end up inside of apply_config + // instead of reconfigure. 
+ } - // 'Close' connection - drop(client); + // 'Close' connection + drop(client); + + Ok(()) + })?; - // reset max_cluster_size in config back to original value and reload config - config::compute_ctl_temp_override_remove(pgdata_path)?; self.pg_reload_conf()?; let unknown_op = "unknown".to_string(); @@ -818,7 +964,10 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute(&self, extension_server_port: u16) -> Result { + pub fn start_compute( + &self, + extension_server_port: u16, + ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -830,8 +979,8 @@ impl ComputeNode { ); // tune pgbouncer - if let Some(connstr) = &self.pgbouncer_connstr { - info!("tuning pgbouncer with connstr: {:?}", connstr); + if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { + info!("tuning pgbouncer"); let rt = tokio::runtime::Builder::new_current_thread() .enable_all() @@ -840,15 +989,9 @@ impl ComputeNode { // Spawn a thread to do the tuning, // so that we don't block the main thread that starts Postgres. 
- let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone(); - let connstr_clone = connstr.clone(); - let pgbouncer_ini_path = self.pgbouncer_ini_path.clone(); + let pgbouncer_settings = pgbouncer_settings.clone(); let _handle = thread::spawn(move || { - let res = rt.block_on(tune_pgbouncer( - pgbouncer_settings, - &connstr_clone, - pgbouncer_ini_path, - )); + let res = rt.block_on(tune_pgbouncer(pgbouncer_settings)); if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -889,21 +1032,29 @@ impl ComputeNode { self.prepare_pgdata(&compute_state, extension_server_port)?; let start_time = Utc::now(); - let pg = self.start_postgres(pspec.storage_auth_token.clone())?; + let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?; - self.pg_reload_conf()?; + if pspec.spec.mode == ComputeMode::Primary { + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... 
+ config::with_compute_ctl_tmp_override( + pgdata_path, + "neon.max_cluster_size=-1", + || { + self.pg_reload_conf()?; - self.apply_config(&compute_state)?; + self.apply_config(&compute_state)?; - config::compute_ctl_temp_override_remove(pgdata_path)?; - self.pg_reload_conf()?; + Ok(()) + }, + )?; + self.pg_reload_conf()?; + } + self.post_apply_config()?; } let startup_end_time = Utc::now(); @@ -939,7 +1090,17 @@ impl ComputeNode { }; info!(?metrics, "compute start finished"); - Ok(pg) + Ok(pg_process) + } + + /// Update the `last_active` in the shared state, but ensure that it's a more recent one. + pub fn update_last_active(&self, last_active: Option>) { + let mut state = self.state.lock().unwrap(); + // NB: `Some()` is always greater than `None`. + if last_active > state.last_active { + state.last_active = last_active; + debug!("set the last compute activity time to: {:?}", last_active); + } } // Look for core dumps and collect backtraces. @@ -1118,10 +1279,12 @@ LIMIT 100", .await .map_err(DownloadError::Other); - self.ext_download_progress - .write() - .expect("bad lock") - .insert(ext_archive_name.to_string(), (download_start, true)); + if download_size.is_ok() { + self.ext_download_progress + .write() + .expect("bad lock") + .insert(ext_archive_name.to_string(), (download_start, true)); + } download_size } @@ -1214,3 +1377,17 @@ LIMIT 100", Ok(remote_ext_metrics) } } + +pub fn forward_termination_signal() { + let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); + if ss_pid != 0 { + let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); + kill(ss_pid, Signal::SIGTERM).ok(); + } + let pg_pid = PG_PID.load(Ordering::SeqCst); + if pg_pid != 0 { + let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); + // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html + kill(pg_pid, Signal::SIGQUIT).ok(); + } +} diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index a7ef8cea92..2c4aec4116 100644 --- 
a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,8 +6,8 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::escape_conf_value; -use crate::pg_helpers::PgOptionsSerialize; -use compute_api::spec::{ComputeMode, ComputeSpec}; +use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize}; +use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. @@ -17,6 +17,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { .write(true) .create(true) .append(false) + .truncate(false) .open(path)?; let buf = io::BufReader::new(&file); let mut count: usize = 0; @@ -51,6 +52,9 @@ pub fn write_postgres_conf( if let Some(s) = &spec.pageserver_connstring { writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?; } + if let Some(stripe_size) = spec.shard_stripe_size { + writeln!(file, "neon.stripe_size={stripe_size}")?; + } if !spec.safekeeper_connstrings.is_empty() { writeln!( file, @@ -79,6 +83,33 @@ pub fn write_postgres_conf( ComputeMode::Replica => { // hot_standby is 'on' by default, but let's be explicit writeln!(file, "hot_standby=on")?; + + // Inform the replica about the primary state + // Default is 'false' + if let Some(primary_is_running) = spec.primary_is_running { + writeln!(file, "neon.primary_is_running={}", primary_is_running)?; + } + } + } + + if cfg!(target_os = "linux") { + // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is + // disabled), then the control plane has enabled swap and we should set + // dynamic_shared_memory_type = 'mmap'. + // + // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047. + let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory") + // ignore any errors - they may be expected to occur under certain situations (e.g. when + // not running in Linux). 
+ .unwrap_or_else(|_| String::new()); + if overcommit_memory_contents.trim() == "2" { + let opt = GenericOption { + name: "dynamic_shared_memory_type".to_owned(), + value: Some("mmap".to_owned()), + vartype: "enum".to_owned(), + }; + + write!(file, "{}", opt.to_pg_setting())?; } } @@ -100,18 +131,17 @@ pub fn write_postgres_conf( Ok(()) } -/// create file compute_ctl_temp_override.conf in pgdata_dir -/// add provided options to this file -pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> { +pub fn with_compute_ctl_tmp_override(pgdata_path: &Path, options: &str, exec: F) -> Result<()> +where + F: FnOnce() -> Result<()>, +{ let path = pgdata_path.join("compute_ctl_temp_override.conf"); let mut file = File::create(path)?; write!(file, "{}", options)?; - Ok(()) -} -/// remove file compute_ctl_temp_override.conf in pgdata_dir -pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> { - let path = pgdata_path.join("compute_ctl_temp_override.conf"); - std::fs::remove_file(path)?; - Ok(()) + let res = exec(); + + file.set_len(0)?; + + res } diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 2cec12119f..ef1db73982 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json } } */ -use anyhow::{self, Result}; +use anyhow::Result; use anyhow::{bail, Context}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index fa2c4cff28..43d29402bc 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -5,18 +5,21 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::catalog::SchemaDumpError; +use crate::catalog::{get_database_schema, get_dbs_and_roles}; +use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, 
ParsedSpec}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; use anyhow::Result; +use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; -use num_cpus; -use serde_json; use tokio::task; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use tracing_utils::http::OtelName; +use utils::http::request::must_get_query_param; fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { ComputeStatusResponse { @@ -45,7 +48,7 @@ async fn routes(req: Request, compute: &Arc) -> Response { - info!("serving /status GET request"); + debug!("serving /status GET request"); let state = compute.state.lock().unwrap(); let status_response = status_response_from_state(&state); Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) @@ -123,6 +126,45 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /terminate POST request"); + match handle_terminate_request(compute).await { + Ok(()) => Response::new(Body::empty()), + Err((msg, code)) => { + error!("error handling /terminate request: {msg}"); + render_json_error(&msg, code) + } + } + } + + (&Method::GET, "/dbs_and_roles") => { + info!("serving /dbs_and_roles GET request",); + match get_dbs_and_roles(compute).await { + Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), + Err(_) => { + render_json_error("can't get dbs and roles", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + + (&Method::GET, "/database_schema") => { + let database = match must_get_query_param(&req, "database") { + Err(e) => return e.into_response(), + Ok(database) => database, + }; + info!("serving /database_schema GET request with database: {database}",); + match get_database_schema(compute, &database).await { + Ok(res) => render_plain(Body::wrap_stream(res)), + 
Err(SchemaDumpError::DatabaseDoesNotExist) => { + render_json_error("database does not exist", StatusCode::NOT_FOUND) + } + Err(e) => { + error!("can't get schema dump: {}", e); + render_json_error("can't get schema dump", StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); @@ -293,10 +335,68 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { }; Response::builder() .status(status) + .header(CONTENT_TYPE, "application/json") .body(Body::from(serde_json::to_string(&error).unwrap())) .unwrap() } +fn render_json(body: Body) -> Response { + Response::builder() + .header(CONTENT_TYPE, "application/json") + .body(body) + .unwrap() +} + +fn render_plain(body: Body) -> Response { + Response::builder() + .header(CONTENT_TYPE, "text/plain") + .body(body) + .unwrap() +} + +async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { + { + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::Terminated { + return Ok(()); + } + if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for termination request: {:?}", + state.status.clone() + ); + return Err((msg, StatusCode::PRECONDITION_FAILED)); + } + state.status = ComputeStatus::TerminationPending; + compute.state_changed.notify_all(); + drop(state); + } + forward_termination_signal(); + info!("sent signal and notified waiters"); + + // Spawn a blocking thread to wait for compute to become Terminated. + // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. 
+ let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Terminated { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become Terminated, current status: {:?}", + state.status + ); + } + + Ok(()) + }) + .await + .unwrap()?; + info!("terminated Postgres"); + Ok(()) +} + // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] async fn serve(port: u16, state: Arc) { diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index cedc6ece8f..b0ddaeae2b 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -68,6 +68,51 @@ paths: schema: $ref: "#/components/schemas/Info" + /dbs_and_roles: + get: + tags: + - Info + summary: Get databases and roles in the catalog. + description: "" + operationId: getDbsAndRoles + responses: + 200: + description: Compute schema objects + content: + application/json: + schema: + $ref: "#/components/schemas/DbsAndRoles" + + /database_schema: + get: + tags: + - Info + summary: Get schema dump + parameters: + - name: database + in: query + description: Database name to dump. + required: true + schema: + type: string + example: "postgres" + description: Get schema dump in SQL format. + operationId: getDatabaseSchema + responses: + 200: + description: Schema dump + content: + text/plain: + schema: + type: string + description: Schema dump in SQL format. + 404: + description: Non existing database. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /check_writability: post: tags: @@ -168,6 +213,29 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /terminate: + post: + tags: + - Terminate + summary: Terminate Postgres and wait for it to exit + description: "" + operationId: terminate + responses: + 200: + description: Result + 412: + description: "wrong state" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: "Unexpected error" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: @@ -206,6 +274,73 @@ components: num_cpus: type: integer + DbsAndRoles: + type: object + description: Databases and Roles + required: + - roles + - databases + properties: + roles: + type: array + items: + $ref: "#/components/schemas/Role" + databases: + type: array + items: + $ref: "#/components/schemas/Database" + + Database: + type: object + description: Database + required: + - name + - owner + - restrict_conn + - invalid + properties: + name: + type: string + owner: + type: string + options: + type: array + items: + $ref: "#/components/schemas/GenericOption" + restrict_conn: + type: boolean + invalid: + type: boolean + + Role: + type: object + description: Role + required: + - name + properties: + name: + type: string + encrypted_password: + type: string + options: + type: array + items: + $ref: "#/components/schemas/GenericOption" + + GenericOption: + type: object + description: Schema Generic option + required: + - name + - vartype + properties: + name: + type: string + value: + type: string + vartype: + type: string + ComputeState: type: object required: diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 4e01ffd954..18c228ba54 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -8,10 +8,12 @@ pub mod configurator; pub mod http; #[macro_use] pub mod logger; +pub mod 
catalog; pub mod compute; pub mod extension_server; pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +pub mod swap; pub mod sync_sk; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 3ae68de8ef..84be5b0809 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -38,3 +38,9 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { Ok(()) } + +/// Replace all newline characters with a special character to make it +/// easier to grep for log messages. +pub fn inlinify(s: &str) -> String { + s.replace('\n', "\u{200B}") +} diff --git a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql new file mode 100644 index 0000000000..73b36a37f6 --- /dev/null +++ b/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql @@ -0,0 +1 @@ +ALTER ROLE neon_superuser BYPASSRLS; diff --git a/compute_tools/src/migrations/0001-alter_roles.sql b/compute_tools/src/migrations/0001-alter_roles.sql new file mode 100644 index 0000000000..6cb49f873f --- /dev/null +++ b/compute_tools/src/migrations/0001-alter_roles.sql @@ -0,0 +1,18 @@ +DO $$ +DECLARE + role_name text; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member') + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT'; + END LOOP; + + FOR role_name IN SELECT rolname FROM pg_roles + WHERE + NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_') + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS'; + END LOOP; +END $$; diff --git a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql 
b/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql new file mode 100644 index 0000000000..37f0ce211f --- /dev/null +++ b/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql @@ -0,0 +1,6 @@ +DO $$ +BEGIN + IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN + EXECUTE 'GRANT pg_create_subscription TO neon_superuser'; + END IF; +END $$; diff --git a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql new file mode 100644 index 0000000000..11afd3b635 --- /dev/null +++ b/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql @@ -0,0 +1 @@ +GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION; diff --git a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql new file mode 100644 index 0000000000..8abe052494 --- /dev/null +++ b/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql @@ -0,0 +1,4 @@ +-- SKIP: Deemed insufficient for allowing relations created by extensions to be +-- interacted with by neon_superuser without permission issues. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser; diff --git a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql new file mode 100644 index 0000000000..5bcb026e0c --- /dev/null +++ b/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql @@ -0,0 +1,4 @@ +-- SKIP: Deemed insufficient for allowing relations created by extensions to be +-- interacted with by neon_superuser without permission issues. 
+ +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser; diff --git a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..ce7c96753e --- /dev/null +++ b/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,3 @@ +-- SKIP: Moved inline to the handle_grants() functions. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql new file mode 100644 index 0000000000..72baf920cd --- /dev/null +++ b/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql @@ -0,0 +1,3 @@ +-- SKIP: Moved inline to the handle_grants() functions. + +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION; diff --git a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql b/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql new file mode 100644 index 0000000000..47129d65b8 --- /dev/null +++ b/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql @@ -0,0 +1,13 @@ +-- SKIP: The original goal of this migration was to prevent creating +-- subscriptions, but this migration was insufficient. 
+ +DO $$ +DECLARE + role_name TEXT; +BEGIN + FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE + LOOP + RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name); + EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION'; + END LOOP; +END $$; diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index fd19b7e53f..872a3f7750 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -3,88 +3,118 @@ use std::{thread, time::Duration}; use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use crate::compute::ComputeNode; +use compute_api::responses::ComputeStatus; +use compute_api::spec::ComputeFeature; const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // Spin in a loop and figure out the last activity time in the Postgres. // Then update it in the shared state. This function never errors out. -// XXX: the only expected panic is at `RwLock` unwrap(). +// NB: the only expected panic is at `Mutex` unwrap(), all other errors +// should be handled gracefully. fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change let connstr = compute.connstr.as_str(); + + // During startup and configuration we connect to every Postgres database, + // but we don't want to count this as some user activity. So wait until + // the compute fully started before monitoring activity. + wait_for_postgres_start(compute); + // Define `client` outside of the loop to reuse existing connection if it's active. 
let mut client = Client::connect(connstr, NoTls); - info!("watching Postgres activity at {}", connstr); + let mut sleep = false; + let mut prev_active_time: Option = None; + let mut prev_sessions: Option = None; + + if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) { + info!("starting experimental activity monitor for {}", connstr); + } else { + info!("starting activity monitor for {}", connstr); + } loop { - // Should be outside of the write lock to allow others to read while we sleep. - thread::sleep(MONITOR_CHECK_INTERVAL); + // We use `continue` a lot, so it's more convenient to sleep at the top of the loop. + // But skip the first sleep, so we can connect to Postgres immediately. + if sleep { + // Should be outside of the mutex lock to allow others to read while we sleep. + thread::sleep(MONITOR_CHECK_INTERVAL); + } else { + sleep = true; + } match &mut client { Ok(cli) => { if cli.is_closed() { - info!("connection to postgres closed, trying to reconnect"); + info!("connection to Postgres is closed, trying to reconnect"); // Connection is closed, reconnect and try again. client = Client::connect(connstr, NoTls); continue; } - // Get all running client backends except ourself, use RFC3339 DateTime format. - let backends = cli - .query( - "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change - FROM pg_stat_activity - WHERE backend_type = 'client backend' - AND pid != pg_backend_pid() - AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? - &[], - ); - let mut last_active = compute.state.lock().unwrap().last_active; + // This is a new logic, only enable if the feature flag is set. + // TODO: remove this once we are sure that it works OR drop it altogether. + if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) { + // First, check if the total active time or sessions across all databases has changed. + // If it did, it means that user executed some queries. 
In theory, it can even go down if + // some databases were dropped, but it's still a user activity. + match get_database_stats(cli) { + Ok((active_time, sessions)) => { + let mut detected_activity = false; - if let Ok(backs) = backends { - let mut idle_backs: Vec> = vec![]; - - for b in backs.into_iter() { - let state: String = match b.try_get("state") { - Ok(state) => state, - Err(_) => continue, - }; - - if state == "idle" { - let change: String = match b.try_get("state_change") { - Ok(state_change) => state_change, - Err(_) => continue, - }; - let change = DateTime::parse_from_rfc3339(&change); - match change { - Ok(t) => idle_backs.push(t.with_timezone(&Utc)), - Err(e) => { - info!("cannot parse backend state_change DateTime: {}", e); - continue; + prev_active_time = match prev_active_time { + Some(prev_active_time) => { + if active_time != prev_active_time { + detected_activity = true; + } + Some(active_time) } - } - } else { - // Found non-idle backend, so the last activity is NOW. - // Save it and exit the for loop. Also clear the idle backend - // `state_change` timestamps array as it doesn't matter now. - last_active = Some(Utc::now()); - idle_backs.clear(); - break; - } - } + None => Some(active_time), + }; + prev_sessions = match prev_sessions { + Some(prev_sessions) => { + if sessions != prev_sessions { + detected_activity = true; + } + Some(sessions) + } + None => Some(sessions), + }; - // Get idle backend `state_change` with the max timestamp. - if let Some(last) = idle_backs.iter().max() { - last_active = Some(*last); + if detected_activity { + // Update the last active time and continue, we don't need to + // check backends state change. + compute.update_last_active(Some(Utc::now())); + continue; + } + } + Err(e) => { + error!("could not get database statistics: {}", e); + continue; + } } } - // If there are existing (logical) walsenders, do not suspend. 
+ // Second, if the database statistics are unchanged, check the state changes of all backends; + // maybe some of them have more recent activity. `get_backends_state_change()` + // can return None or a stale timestamp, so it is the responsibility of `compute.update_last_active()` + // to check if the new timestamp is more recent than the current one. + // This helps us discover new sessions that have not done anything yet. + match get_backends_state_change(cli) { + Ok(last_active) => { + compute.update_last_active(last_active); + } + Err(e) => { + error!("could not get backends state change: {}", e); + } + } + + // Finally, if there are existing (logical) walsenders, do not suspend. // // walproposer doesn't currently show up in pg_stat_replication, // but protect if it will be @@ -93,11 +123,12 @@ fn watch_compute_activity(compute: &ComputeNode) { Ok(r) => match r.try_get::<&str, i64>("count") { Ok(num_ws) => { if num_ws > 0 { - last_active = Some(Utc::now()); + compute.update_last_active(Some(Utc::now())); + continue; } } Err(e) => { - warn!("failed to parse ws count: {:?}", e); + warn!("failed to parse walsenders count: {:?}", e); continue; } }, @@ -106,17 +137,59 @@ fn watch_compute_activity(compute: &ComputeNode) { continue; } } - - // Update the last activity in the shared state if we got a more recent one. - let mut state = compute.state.lock().unwrap(); - // NB: `Some()` is always greater than `None`. 
- if last_active > state.last_active { - state.last_active = last_active; - debug!("set the last compute activity time to: {:?}", last_active); + // + // Don't suspend compute if there is an active logical replication subscription + // + // `where pid is not null` – to filter out read only computes and subscription on branches + // + let logical_subscriptions_query = + "select count(*) from pg_stat_subscription where pid is not null;"; + match cli.query_one(logical_subscriptions_query, &[]) { + Ok(row) => match row.try_get::<&str, i64>("count") { + Ok(num_subscribers) => { + if num_subscribers > 0 { + compute.update_last_active(Some(Utc::now())); + continue; + } + } + Err(e) => { + warn!("failed to parse `pg_stat_subscription` count: {:?}", e); + continue; + } + }, + Err(e) => { + warn!( + "failed to get list of active logical replication subscriptions: {:?}", + e + ); + continue; + } + } + // + // Do not suspend compute if autovacuum is running + // + let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'"; + match cli.query_one(autovacuum_count_query, &[]) { + Ok(r) => match r.try_get::<&str, i64>("count") { + Ok(num_workers) => { + if num_workers > 0 { + compute.update_last_active(Some(Utc::now())); + continue; + } + } + Err(e) => { + warn!("failed to parse autovacuum workers count: {:?}", e); + continue; + } + }, + Err(e) => { + warn!("failed to get list of autovacuum workers: {:?}", e); + continue; + } } } Err(e) => { - debug!("cannot connect to postgres: {}, retrying", e); + debug!("could not connect to Postgres: {}, retrying", e); // Establish a new connection and try again. client = Client::connect(connstr, NoTls); @@ -125,12 +198,124 @@ fn watch_compute_activity(compute: &ComputeNode) { } } +// Hang on condition variable waiting until the compute status is `Running`. 
+fn wait_for_postgres_start(compute: &ComputeNode) { + let mut state = compute.state.lock().unwrap(); + while state.status != ComputeStatus::Running { + info!("compute is not running, waiting before monitoring activity"); + state = compute.state_changed.wait(state).unwrap(); + + if state.status == ComputeStatus::Running { + break; + } + } +} + +// Figure out the total active time and sessions across all non-system databases. +// Returned tuple is `(active_time, sessions)`. +// It can return `0.0` active time or `0` sessions, which means no user databases exist OR +// it was a start with skipped `pg_catalog` updates and user didn't do any queries +// (or open any sessions) yet. +fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> { + // Filter out `postgres` database as `compute_ctl` and other monitoring tools + // like `postgres_exporter` use it to query Postgres statistics. + // Use explicit 8 bytes type casts to match Rust types. + let stats = cli.query_one( + "SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time, + coalesce(sum(sessions), 0)::bigint AS total_sessions + FROM pg_stat_database + WHERE datname NOT IN ( + 'postgres', + 'template0', + 'template1' + );", + &[], + ); + let stats = match stats { + Ok(stats) => stats, + Err(e) => { + return Err(anyhow::anyhow!("could not query active_time: {}", e)); + } + }; + + let active_time: f64 = match stats.try_get("total_active_time") { + Ok(active_time) => active_time, + Err(e) => return Err(anyhow::anyhow!("could not get total_active_time: {}", e)), + }; + + let sessions: i64 = match stats.try_get("total_sessions") { + Ok(sessions) => sessions, + Err(e) => return Err(anyhow::anyhow!("could not get total_sessions: {}", e)), + }; + + Ok((active_time, sessions)) +} + +// Figure out the most recent state change time across all client backends. +// If there is currently active backend, timestamp will be `Utc::now()`. 
+// It can return `None`, which means no client backends exist or we were +// unable to parse the timestamp. +fn get_backends_state_change(cli: &mut Client) -> anyhow::Result>> { + let mut last_active: Option> = None; + // Get all running client backends except ourself, use RFC3339 DateTime format. + let backends = cli.query( + "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change + FROM pg_stat_activity + WHERE backend_type = 'client backend' + AND pid != pg_backend_pid() + AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? + &[], + ); + + match backends { + Ok(backs) => { + let mut idle_backs: Vec> = vec![]; + + for b in backs.into_iter() { + let state: String = match b.try_get("state") { + Ok(state) => state, + Err(_) => continue, + }; + + if state == "idle" { + let change: String = match b.try_get("state_change") { + Ok(state_change) => state_change, + Err(_) => continue, + }; + let change = DateTime::parse_from_rfc3339(&change); + match change { + Ok(t) => idle_backs.push(t.with_timezone(&Utc)), + Err(e) => { + info!("cannot parse backend state_change DateTime: {}", e); + continue; + } + } + } else { + // Found non-idle backend, so the last activity is NOW. + // Return immediately, no need to check other backends. + return Ok(Some(Utc::now())); + } + } + + // Get idle backend `state_change` with the max timestamp. + if let Some(last) = idle_backs.iter().max() { + last_active = Some(*last); + } + } + Err(e) => { + return Err(anyhow::anyhow!("could not query backends: {}", e)); + } + } + + Ok(last_active) +} + /// Launch a separate compute monitor thread and return its `JoinHandle`. 
-pub fn launch_monitor(state: &Arc) -> thread::JoinHandle<()> { - let state = Arc::clone(state); +pub fn launch_monitor(compute: &Arc) -> thread::JoinHandle<()> { + let compute = Arc::clone(compute); thread::Builder::new() .name("compute-monitor".into()) - .spawn(move || watch_compute_activity(&state)) + .spawn(move || watch_compute_activity(&compute)) .expect("cannot launch compute monitor thread") } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 0b0e137c03..fa0822748b 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -6,12 +6,15 @@ use std::io::{BufRead, BufReader}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; +use std::thread::JoinHandle; use std::time::{Duration, Instant}; use anyhow::{bail, Result}; use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; +use tokio::io::AsyncBufReadExt; +use tokio::time::timeout; use tokio_postgres::NoTls; use tracing::{debug, error, info, instrument}; @@ -41,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String { format!("'{}'", res) } -trait GenericOptionExt { +pub trait GenericOptionExt { fn to_pg_option(&self) -> String; fn to_pg_setting(&self) -> String; } @@ -261,9 +264,10 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { // case we miss some events for some reason. Not strictly necessary, but // better safe than sorry. 
let (tx, rx) = std::sync::mpsc::channel(); - let (mut watcher, rx): (Box, _) = match notify::recommended_watcher(move |res| { + let watcher_res = notify::recommended_watcher(move |res| { let _ = tx.send(res); - }) { + }); + let (mut watcher, rx): (Box, _) = match watcher_res { Ok(watcher) => (Box::new(watcher), rx), Err(e) => { match e.kind { @@ -363,7 +367,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> { } /// Update pgbouncer.ini with provided options -pub fn update_pgbouncer_ini( +fn update_pgbouncer_ini( pgbouncer_config: HashMap, pgbouncer_ini_path: &str, ) -> Result<()> { @@ -372,6 +376,10 @@ pub fn update_pgbouncer_ini( for (option_name, value) in pgbouncer_config.iter() { section.insert(option_name, value); + debug!( + "Updating pgbouncer.ini with new values {}={}", + option_name, value + ); } conf.write_to_file(pgbouncer_ini_path)?; @@ -381,46 +389,146 @@ pub fn update_pgbouncer_ini( /// Tune pgbouncer. /// 1. Apply new config using pgbouncer admin console /// 2. Add new values to pgbouncer.ini to preserve them after restart -pub async fn tune_pgbouncer( - pgbouncer_settings: Option>, - pgbouncer_connstr: &str, - pgbouncer_ini_path: Option, -) -> Result<()> { - if let Some(pgbouncer_config) = pgbouncer_settings { - // Apply new config - let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await; - let (client, connection) = connect_result.unwrap(); - tokio::spawn(async move { - if let Err(e) = connection.await { - eprintln!("connection error: {}", e); +pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result<()> { + let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() { + // for VMs use pgbouncer specific way to connect to + // pgbouncer admin console without password + // when pgbouncer is running under the same user. 
+ "host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string() + } else { + // for k8s use normal connection string with password + // to connect to pgbouncer admin console + let mut pgbouncer_connstr = + "host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string(); + if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") { + pgbouncer_connstr.push_str(format!(" password={}", pass).as_str()); + } + pgbouncer_connstr + }; + + info!( + "Connecting to pgbouncer with connection string: {}", + pgbouncer_connstr + ); + + // connect to pgbouncer, retrying several times + // because pgbouncer may not be ready yet + let mut retries = 3; + let client = loop { + match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await { + Ok((client, connection)) => { + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + break client; } - }); + Err(e) => { + if retries == 0 { + return Err(e.into()); + } + error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e); + retries -= 1; + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + }; - for (option_name, value) in pgbouncer_config.iter() { - info!( - "Applying pgbouncer setting change: {} = {}", - option_name, value + // Apply new config + for (option_name, value) in pgbouncer_config.iter() { + let query = format!("SET {}={}", option_name, value); + // keep this log line for debugging purposes + info!("Applying pgbouncer setting change: {}", query); + + if let Err(err) = client.simple_query(&query).await { + // Don't fail on error, just print it into log + error!( + "Failed to apply pgbouncer setting change: {}, {}", + query, err ); - let query = format!("SET {} = {}", option_name, value); + }; + } - let result = client.simple_query(&query).await; + // save values to pgbouncer.ini + // so that they are preserved after pgbouncer restart + let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() { + // in VMs we use 
/etc/pgbouncer.ini + "/etc/pgbouncer.ini".to_string() + } else { + // in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini + // this is a shared volume between pgbouncer and postgres containers + // FIXME: fix permissions for this file + "/var/db/postgres/pgbouncer/pgbouncer.ini".to_string() + }; + update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; - info!("Applying pgbouncer setting change: {}", query); - info!("pgbouncer setting change result: {:?}", result); + Ok(()) +} - if let Err(err) = result { - // Don't fail on error, just print it into log - error!( - "Failed to apply pgbouncer setting change: {}, {}", - query, err - ); - }; +/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs +/// and send them to the logger. In the future we may also want to add context to +/// these logs. +pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> { + std::thread::spawn(move || { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to build tokio runtime"); + + let res = runtime.block_on(async move { + let stderr = tokio::process::ChildStderr::from_std(stderr)?; + handle_postgres_logs_async(stderr).await + }); + if let Err(e) = res { + tracing::error!("error while processing postgres logs: {}", e); + } + }) +} + +/// Read Postgres logs from `stderr` until EOF. 
Buffer is flushed on one of the following conditions: +/// - next line starts with timestamp +/// - EOF +/// - no new lines were written within the read timeout (100 ms) +async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { + let mut lines = tokio::io::BufReader::new(stderr).lines(); + let timeout_duration = Duration::from_millis(100); + let ts_regex = + regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid"); + + let mut buf = vec![]; + loop { + let next_line = timeout(timeout_duration, lines.next_line()).await; + + // we should flush lines from the buffer if we cannot continue reading multiline message + let should_flush_buf = match next_line { + // Flushing if new line starts with timestamp + Ok(Ok(Some(ref line))) => ts_regex.is_match(line), + // Flushing on EOF, timeout or error + _ => true, + }; + + if !buf.is_empty() && should_flush_buf { + // join multiline message into a single line, separated by unicode Zero Width Space. + // "PG:" prefix is used to distinguish postgres logs from other logs. + let combined = format!("PG:{}\n", buf.join("\u{200B}")); + buf.clear(); + + // sync write to stderr to avoid interleaving with other logs + use std::io::Write; + let res = std::io::stderr().lock().write_all(combined.as_bytes()); + if let Err(e) = res { + tracing::error!("error while writing to stderr: {}", e); + } } - // save values to pgbouncer.ini - // so that they are preserved after pgbouncer restart - if let Some(pgbouncer_ini_path) = pgbouncer_ini_path { - update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; + // if not timeout, append line to the buffer + if next_line.is_ok() { + match next_line?? 
{ + Some(line) => buf.push(line), + // EOF + None => break, + }; } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index d545858dc2..143f6c1e5f 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -2,13 +2,14 @@ use std::fs::File; use std::path::Path; use std::str::FromStr; -use anyhow::{anyhow, bail, Result}; +use anyhow::{anyhow, bail, Context, Result}; use postgres::config::Config; use postgres::{Client, NoTls}; use reqwest::StatusCode; use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; +use crate::logger::inlinify; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -189,18 +190,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // Print a list of existing Postgres roles (only in debug mode) if span_enabled!(Level::INFO) { - info!("postgres roles:"); + let mut vec = Vec::new(); for r in &existing_roles { - info!( - " - {}:{}", + vec.push(format!( + "{}:{}", r.name, if r.encrypted_password.is_some() { "[FILTERED]" } else { "(null)" } - ); + )); } + + info!("postgres roles (total {}): {:?}", vec.len(), vec); } // Process delta operations first @@ -238,7 +241,10 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // Refresh Postgres roles info to handle possible roles renaming let existing_roles: Vec = get_existing_roles(&mut xact)?; - info!("cluster spec roles:"); + info!( + "handling cluster spec roles (total {})", + spec.cluster.roles.len() + ); for role in &spec.cluster.roles { let name = &role.name; // XXX: with a limited number of roles it is fine, but consider making it a HashMap @@ -301,7 +307,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", name.pg_quote() ); - info!("role create query: '{}'", &query); + info!("running role create query: '{}'", &query); 
query.push_str(&role.to_pg_options()); xact.execute(query.as_str(), &[])?; } @@ -318,7 +324,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { RoleAction::Create => " -> create", RoleAction::Update => " -> update", }; - info!(" - {}:{}{}", name, pwd, action_str); + info!(" - {}:{}{}", name, pwd, action_str); } } @@ -427,10 +433,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // Print a list of existing Postgres databases (only in debug mode) if span_enabled!(Level::INFO) { - info!("postgres databases:"); + let mut vec = Vec::new(); for (dbname, db) in &existing_dbs { - info!(" {}:{}", dbname, db.owner); + vec.push(format!("{}:{}", dbname, db.owner)); } + info!("postgres databases (total {}): {:?}", vec.len(), vec); } // Process delta operations first @@ -483,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { "rename_db" => { let new_name = op.new_name.as_ref().unwrap(); - if existing_dbs.get(&op.name).is_some() { + if existing_dbs.contains_key(&op.name) { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", op.name.pg_quote(), @@ -502,7 +509,10 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // Refresh Postgres databases info to handle possible renames let existing_dbs = get_existing_dbs(client)?; - info!("cluster spec databases:"); + info!( + "handling cluster spec databases (total {})", + spec.cluster.databases.len() + ); for db in &spec.cluster.databases { let name = &db.name; let pg_db = existing_dbs.get(name); @@ -561,7 +571,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { DatabaseAction::Create => " -> create", DatabaseAction::Update => " -> update", }; - info!(" - {}:{}{}", db.name, db.owner, action_str); + info!(" - {}:{}{}", db.name, db.owner, action_str); } } @@ -571,7 +581,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// 
Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. #[instrument(skip_all)] -pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> { +pub fn handle_grants( + spec: &ComputeSpec, + client: &mut Client, + connstr: &str, + enable_anon_extension: bool, +) -> Result<()> { info!("modifying database permissions"); let existing_dbs = get_existing_dbs(client)?; @@ -640,6 +655,9 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> // remove this code if possible. The worst thing that could happen is that // user won't be able to use public schema in NEW databases created in the // very OLD project. + // + // Also, alter default permissions so that relations created by extensions can be + // used by neon_superuser without permission issues. let grant_query = "DO $$\n\ BEGIN\n\ IF EXISTS(\n\ @@ -658,12 +676,31 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> GRANT CREATE ON SCHEMA public TO web_access;\n\ END IF;\n\ END IF;\n\ + IF EXISTS(\n\ + SELECT nspname\n\ + FROM pg_catalog.pg_namespace\n\ + WHERE nspname = 'public'\n\ + )\n\ + THEN\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;\n\ + ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;\n\ + END IF;\n\ END\n\ $$;" .to_string(); - info!("grant query for db {} : {}", &db.name, &grant_query); + info!( + "grant query for db {} : {}", + &db.name, + inlinify(&grant_query) + ); db_client.simple_query(&grant_query)?; + + // it is important to run this after all grants + if enable_anon_extension { + handle_extension_anon(spec, &db.owner, &mut db_client, false) + .context("handle_grants handle_extension_anon")?; + } } Ok(()) @@ -708,8 +745,234 @@ pub fn handle_extension_neon(client: &mut Client) -> 
Result<()> { // - extension was just installed // - extension was already installed and is up to date let query = "ALTER EXTENSION neon UPDATE"; - info!("update neon extension schema with query: {}", query); + info!("update neon extension version with query: {}", query); + if let Err(e) = client.simple_query(query) { + error!( + "failed to upgrade neon extension during `handle_extension_neon`: {}", + e + ); + } + + Ok(()) +} + +#[instrument(skip_all)] +pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> { + info!("handle neon extension upgrade"); + let query = "ALTER EXTENSION neon UPDATE"; + info!("update neon extension version with query: {}", query); client.simple_query(query)?; Ok(()) } + +#[instrument(skip_all)] +pub fn handle_migrations(client: &mut Client) -> Result<()> { + info!("handle migrations"); + + // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN! + // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + // Add new migrations in numerical order. 
+ let migrations = [ + include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"), + include_str!("./migrations/0001-alter_roles.sql"), + include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"), + include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"), + include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"), + include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"), + include_str!( + "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql" + ), + include_str!( + "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql" + ), + include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), + ]; + + let mut func = || { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + client.simple_query(query)?; + + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + client.simple_query(query)?; + + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + client.simple_query(query)?; + + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + client.simple_query(query)?; + + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + client.simple_query(query)?; + Ok::<_, anyhow::Error>(()) + }; + func().context("handle_migrations prepare")?; + + let query = "SELECT id FROM neon_migration.migration_id"; + let row = client + .query_one(query, &[]) + .context("handle_migrations get migration_id")?; + let mut current_migration: usize = row.get::<&str, i64>("id") as usize; + let starting_migration_id = current_migration; + + let query = "BEGIN"; + client + .simple_query(query) + .context("handle_migrations begin")?; + + while current_migration < migrations.len() { + let migration = &migrations[current_migration]; + if migration.starts_with("-- SKIP") { + info!("Skipping migration 
id={}", current_migration); + } else { + info!( + "Running migration id={}:\n{}\n", + current_migration, migration + ); + client.simple_query(migration).with_context(|| { + format!("handle_migrations current_migration={}", current_migration) + })?; + } + current_migration += 1; + } + let setval = format!( + "UPDATE neon_migration.migration_id SET id={}", + migrations.len() + ); + client + .simple_query(&setval) + .context("handle_migrations update id")?; + + let query = "COMMIT"; + client + .simple_query(query) + .context("handle_migrations commit")?; + + info!( + "Ran {} migrations", + (migrations.len() - starting_migration_id) + ); + + Ok(()) +} + +/// Connect to the database as superuser and pre-create anon extension +/// if it is present in shared_preload_libraries +#[instrument(skip_all)] +pub fn handle_extension_anon( + spec: &ComputeSpec, + db_owner: &str, + db_client: &mut Client, + grants_only: bool, +) -> Result<()> { + info!("handle extension anon"); + + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { + if libs.contains("anon") { + if !grants_only { + // check if extension is already initialized using anon.is_initialized() + let query = "SELECT anon.is_initialized()"; + match db_client.query(query, &[]) { + Ok(rows) => { + if !rows.is_empty() { + let is_initialized: bool = rows[0].get(0); + if is_initialized { + info!("anon extension is already initialized"); + return Ok(()); + } + } + } + Err(e) => { + warn!( + "anon extension is_installed check failed with expected error: {}", + e + ); + } + }; + + // Create anon extension if this compute needs it + // Users cannot create it themselves, because superuser is required. 
+ let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE"; + info!("creating anon extension with query: {}", query); + match db_client.query(query, &[]) { + Ok(_) => {} + Err(e) => { + error!("anon extension creation failed with error: {}", e); + return Ok(()); + } + } + + // check that extension is installed + query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; + let rows = db_client.query(query, &[])?; + if rows.is_empty() { + error!("anon extension is not installed"); + return Ok(()); + } + + // Initialize anon extension + // This also requires superuser privileges, so users cannot do it themselves. + query = "SELECT anon.init()"; + match db_client.query(query, &[]) { + Ok(_) => {} + Err(e) => { + error!("anon.init() failed with error: {}", e); + return Ok(()); + } + } + } + + // check that extension is installed, if not bail early + let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; + match db_client.query(query, &[]) { + Ok(rows) => { + if rows.is_empty() { + error!("anon extension is not installed"); + return Ok(()); + } + } + Err(e) => { + error!("anon extension check failed with error: {}", e); + return Ok(()); + } + }; + + let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + + // Grant permissions to db_owner to use anon extension functions + let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + + // This is needed, because some functions are defined as SECURITY DEFINER. + // In Postgres SECURITY DEFINER functions are executed with the privileges + // of the owner. + // In the anon extension this is needed to access some GUCs, which are only accessible to + // superuser. But we've patched postgres to allow db_owner to access them as well. 
+ // So we need to change owner of these functions to db_owner. + let query = format!(" + SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};' + from pg_proc p + join pg_namespace nsp ON p.pronamespace = nsp.oid + where nsp.nspname = 'anon';", db_owner); + + info!("change anon extension functions owner to db owner"); + db_client.simple_query(&query)?; + + // affects views as well + let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + + let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner); + info!("granting anon extension permissions with query: {}", query); + db_client.simple_query(&query)?; + } + } + + Ok(()) +} diff --git a/compute_tools/src/swap.rs b/compute_tools/src/swap.rs new file mode 100644 index 0000000000..024c5b338e --- /dev/null +++ b/compute_tools/src/swap.rs @@ -0,0 +1,45 @@ +use std::path::Path; + +use anyhow::{anyhow, Context}; +use tracing::warn; + +pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap"; + +pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> { + // run `/neonvm/bin/resize-swap --once {size_bytes}` + // + // Passing '--once' causes resize-swap to delete itself after successful completion, which + // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while + // postgres is running. + // + // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg. + let child_result = std::process::Command::new("/usr/bin/sudo") + .arg(RESIZE_SWAP_BIN) + .arg("--once") + .arg(size_bytes.to_string()) + .spawn(); + + child_result + .context("spawn() failed") + .and_then(|mut child| child.wait().context("wait() failed")) + .and_then(|status| match status.success() { + true => Ok(()), + false => { + // The command failed. 
Maybe it was because the resize-swap file doesn't exist? + // The --once flag causes it to delete itself on success so we don't disable swap + // while postgres is running; maybe this is fine. + match Path::new(RESIZE_SWAP_BIN).try_exists() { + Err(_) | Ok(true) => Err(anyhow!("process exited with {status}")), + // The path doesn't exist; we're actually ok + Ok(false) => { + warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running"); + Ok(()) + }, + } + } + }) + // wrap any prior error with the overall context that we couldn't run the command + .with_context(|| { + format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`") + }) +} diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 898ad05add..e62f3b8a47 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -12,19 +12,23 @@ clap.workspace = true comfy-table.workspace = true futures.workspace = true git-version.workspace = true +humantime.workspace = true nix.workspace = true once_cell.workspace = true postgres.workspace = true hex.workspace = true +humantime-serde.workspace = true hyper.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } +scopeguard.workspace = true serde.workspace = true serde_json.workspace = true serde_with.workspace = true tar.workspace = true thiserror.workspace = true toml.workspace = true +toml_edit.workspace = true tokio.workspace = true tokio-postgres.workspace = true tokio-util.workspace = true diff --git a/control_plane/README.md b/control_plane/README.md new file mode 100644 index 0000000000..827aba5c1f --- /dev/null +++ b/control_plane/README.md @@ -0,0 +1,26 @@ +# Control Plane and Neon Local + +This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command. 
+ +## Example: Start with Postgres 16 + +To create and start a local development environment with Postgres 16, you need to pass the `--pg-version` flag to three of the start-up commands. + +```shell +cargo neon init --pg-version 16 +cargo neon start +cargo neon tenant create --set-default --pg-version 16 +cargo neon endpoint create main --pg-version 16 +cargo neon endpoint start main +``` + +## Example: Create Test User and Database + +By default, `cargo neon` starts an endpoint with the `cloud_admin` role and the `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can create them with the following commands when starting an endpoint. + +```shell +cargo neon endpoint create main --pg-version 16 --update-catalog true +cargo neon endpoint start main --create-test-user true +``` + +The first command creates `neon_superuser` and the necessary roles. The second command creates the `test` user and the `neondb` database. After running the second command, you will see a connection string for connecting as the `test` user.
diff --git a/control_plane/src/attachment_service.rs b/control_plane/src/attachment_service.rs deleted file mode 100644 index 731c05809e..0000000000 --- a/control_plane/src/attachment_service.rs +++ /dev/null @@ -1,138 +0,0 @@ -use crate::{background_process, local_env::LocalEnv}; -use anyhow::anyhow; -use camino::Utf8PathBuf; -use serde::{Deserialize, Serialize}; -use std::{path::PathBuf, process::Child}; -use utils::id::{NodeId, TenantId}; - -pub struct AttachmentService { - env: LocalEnv, - listen: String, - path: PathBuf, - client: reqwest::Client, -} - -const COMMAND: &str = "attachment_service"; - -#[derive(Serialize, Deserialize)] -pub struct AttachHookRequest { - pub tenant_id: TenantId, - pub node_id: Option, -} - -#[derive(Serialize, Deserialize)] -pub struct AttachHookResponse { - pub gen: Option, -} - -#[derive(Serialize, Deserialize)] -pub struct InspectRequest { - pub tenant_id: TenantId, -} - -#[derive(Serialize, Deserialize)] -pub struct InspectResponse { - pub attachment: Option<(u32, NodeId)>, -} - -impl AttachmentService { - pub fn from_env(env: &LocalEnv) -> Self { - let path = env.base_data_dir.join("attachments.json"); - - // Makes no sense to construct this if pageservers aren't going to use it: assume - // pageservers have control plane API set - let listen_url = env.control_plane_api.clone().unwrap(); - - let listen = format!( - "{}:{}", - listen_url.host_str().unwrap(), - listen_url.port().unwrap() - ); - - Self { - env: env.clone(), - path, - listen, - client: reqwest::ClientBuilder::new() - .build() - .expect("Failed to construct http client"), - } - } - - fn pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid")) - .expect("non-Unicode path") - } - - pub async fn start(&self) -> anyhow::Result { - let path_str = self.path.to_string_lossy(); - - background_process::start_process( - COMMAND, - &self.env.base_data_dir, - &self.env.attachment_service_bin(), - ["-l", &self.listen, 
"-p", &path_str], - [], - background_process::InitialPidFile::Create(self.pid_file()), - // TODO: a real status check - || async move { anyhow::Ok(true) }, - ) - .await - } - - pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { - background_process::stop_process(immediate, COMMAND, &self.pid_file()) - } - - /// Call into the attach_hook API, for use before handing out attachments to pageservers - pub async fn attach_hook( - &self, - tenant_id: TenantId, - pageserver_id: NodeId, - ) -> anyhow::Result> { - use hyper::StatusCode; - - let url = self - .env - .control_plane_api - .clone() - .unwrap() - .join("attach-hook") - .unwrap(); - - let request = AttachHookRequest { - tenant_id, - node_id: Some(pageserver_id), - }; - - let response = self.client.post(url).json(&request).send().await?; - if response.status() != StatusCode::OK { - return Err(anyhow!("Unexpected status {}", response.status())); - } - - let response = response.json::().await?; - Ok(response.gen) - } - - pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result> { - use hyper::StatusCode; - - let url = self - .env - .control_plane_api - .clone() - .unwrap() - .join("inspect") - .unwrap(); - - let request = InspectRequest { tenant_id }; - - let response = self.client.post(url).json(&request).send().await?; - if response.status() != StatusCode::OK { - return Err(anyhow!("Unexpected status {}", response.status())); - } - - let response = response.json::().await?; - Ok(response.attachment) - } -} diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 20fa3af9b8..3f4ddbdb2b 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -17,7 +17,7 @@ use std::io::Write; use std::os::unix::prelude::AsRawFd; use std::os::unix::process::CommandExt; use std::path::Path; -use std::process::{Child, Command}; +use std::process::Command; use std::time::Duration; use std::{fs, io, thread}; @@ -60,7 +60,7 @@ pub 
async fn start_process( envs: EI, initial_pid_file: InitialPidFile, process_status_check: F, -) -> anyhow::Result +) -> anyhow::Result<()> where F: Fn() -> Fut, Fut: std::future::Future>, @@ -69,10 +69,12 @@ where // Not generic AsRef, otherwise empty `envs` prevents type inference EI: IntoIterator, { + if !datadir.metadata().context("stat datadir")?.is_dir() { + anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}"); + } let log_path = datadir.join(format!("{process_name}.log")); let process_log_file = fs::OpenOptions::new() .create(true) - .write(true) .append(true) .open(&log_path) .with_context(|| { @@ -86,8 +88,17 @@ where let background_command = command .stdout(process_log_file) .stderr(same_file_for_stderr) - .args(args); - let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); + .args(args) + // spawn all child processes in their datadir, useful for all kinds of things, + // not least cleaning up child processes e.g. 
after an unclean exit from the test suite: + // ``` + // lsof -d cwd -a +D Users/cs/src/neon/test_output + // ``` + .current_dir(datadir); + + let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars( + fill_rust_env_vars(background_command), + )); filled_cmd.envs(envs); let pid_file_to_check = match &initial_pid_file { @@ -98,7 +109,7 @@ where InitialPidFile::Expect(path) => path, }; - let mut spawned_process = filled_cmd.spawn().with_context(|| { + let spawned_process = filled_cmd.spawn().with_context(|| { format!("Could not spawn {process_name}, see console output and log files for details.") })?; let pid = spawned_process.id(); @@ -106,12 +117,26 @@ where i32::try_from(pid) .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?, ); + // set up a scopeguard to kill & wait for the child in case we panic or bail below + let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| { + println!("SIGKILL & wait the started process"); + (|| { + // TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e..g, walredo). 
+ spawned_process.kill().context("SIGKILL child")?; + spawned_process.wait().context("wait() for child process")?; + anyhow::Ok(()) + })() + .with_context(|| format!("scopeguard kill&wait child {process_name:?}")) + .unwrap(); + }); for retries in 0..RETRIES { match process_started(pid, pid_file_to_check, &process_status_check).await { Ok(true) => { - println!("\n{process_name} started, pid: {pid}"); - return Ok(spawned_process); + println!("\n{process_name} started and passed status check, pid: {pid}"); + // leak the child process, it'll outlive this neon_local invocation + drop(scopeguard::ScopeGuard::into_inner(spawned_process)); + return Ok(()); } Ok(false) => { if retries == NOTICE_AFTER_RETRIES { @@ -126,16 +151,15 @@ where thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS)); } Err(e) => { - println!("{process_name} failed to start: {e:#}"); - if let Err(e) = spawned_process.kill() { - println!("Could not stop {process_name} subprocess: {e:#}") - }; + println!("error starting process {process_name:?}: {e:#}"); return Err(e); } } } println!(); - anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds"); + anyhow::bail!( + "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds" + ); } /// Stops the process, using the pid file given. Returns Ok also if the process is already not running. @@ -243,7 +267,9 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { for env_key in [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", - "AWS_SESSION_TOKEN", + "AWS_PROFILE", + // HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions. 
+ "HOME", "AZURE_STORAGE_ACCOUNT", "AZURE_STORAGE_ACCESS_KEY", ] { @@ -254,6 +280,15 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { cmd } +fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command { + for (var, val) in std::env::vars() { + if var.starts_with("NEON_PAGESERVER_") { + cmd = cmd.env(var, val); + } + } + cmd +} + /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(), /// 1. Claims a pidfile with a fcntl lock on it and /// 2. Sets up the pidfile's file descriptor so that it (and the lock) @@ -280,7 +315,7 @@ where // is in state 'taken' but the thread that would unlock it is // not there. // 2. A rust object that represented some external resource in the - // parent now got implicitly copied by the the fork, even though + // parent now got implicitly copied by the fork, even though // the object's type is not `Copy`. The parent program may use // non-copyability as way to enforce unique ownership of an // external resource in the typesystem. The fork breaks that diff --git a/control_plane/src/bin/attachment_service.rs b/control_plane/src/bin/attachment_service.rs deleted file mode 100644 index e50c8fbba0..0000000000 --- a/control_plane/src/bin/attachment_service.rs +++ /dev/null @@ -1,337 +0,0 @@ -/// The attachment service mimics the aspects of the control plane API -/// that are required for a pageserver to operate. -/// -/// This enables running & testing pageservers without a full-blown -/// deployment of the Neon cloud platform. 
-/// -use anyhow::anyhow; -use clap::Parser; -use hex::FromHex; -use hyper::StatusCode; -use hyper::{Body, Request, Response}; -use pageserver_api::shard::TenantShardId; -use serde::{Deserialize, Serialize}; -use std::path::{Path, PathBuf}; -use std::{collections::HashMap, sync::Arc}; -use utils::http::endpoint::request_span; -use utils::logging::{self, LogFormat}; -use utils::signals::{ShutdownSignals, Signal}; - -use utils::{ - http::{ - endpoint::{self}, - error::ApiError, - json::{json_request, json_response}, - RequestExt, RouterBuilder, - }, - id::{NodeId, TenantId}, - tcp_listener, -}; - -use pageserver_api::control_api::{ - ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, - ValidateResponseTenant, -}; - -use control_plane::attachment_service::{ - AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, -}; - -#[derive(Parser)] -#[command(author, version, about, long_about = None)] -#[command(arg_required_else_help(true))] -struct Cli { - /// Host and port to listen on, like `127.0.0.1:1234` - #[arg(short, long)] - listen: std::net::SocketAddr, - - /// Path to the .json file to store state (will be created if it doesn't exist) - #[arg(short, long)] - path: PathBuf, -} - -// The persistent state of each Tenant -#[derive(Serialize, Deserialize, Clone)] -struct TenantState { - // Currently attached pageserver - pageserver: Option, - - // Latest generation number: next time we attach, increment this - // and use the incremented number when attaching - generation: u32, -} - -fn to_hex_map(input: &HashMap, serializer: S) -> Result -where - S: serde::Serializer, - V: Clone + Serialize, -{ - let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone())); - - transformed - .collect::>() - .serialize(serializer) -} - -fn from_hex_map<'de, D, V>(deserializer: D) -> Result, D::Error> -where - D: serde::de::Deserializer<'de>, - V: Deserialize<'de>, -{ - let hex_map = 
HashMap::::deserialize(deserializer)?; - hex_map - .into_iter() - .map(|(k, v)| { - TenantId::from_hex(k) - .map(|k| (k, v)) - .map_err(serde::de::Error::custom) - }) - .collect() -} - -// Top level state available to all HTTP handlers -#[derive(Serialize, Deserialize)] -struct PersistentState { - #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")] - tenants: HashMap, - - #[serde(skip)] - path: PathBuf, -} - -impl PersistentState { - async fn save(&self) -> anyhow::Result<()> { - let bytes = serde_json::to_vec(self)?; - tokio::fs::write(&self.path, &bytes).await?; - - Ok(()) - } - - async fn load(path: &Path) -> anyhow::Result { - let bytes = tokio::fs::read(path).await?; - let mut decoded = serde_json::from_slice::(&bytes)?; - decoded.path = path.to_owned(); - Ok(decoded) - } - - async fn load_or_new(path: &Path) -> Self { - match Self::load(path).await { - Ok(s) => { - tracing::info!("Loaded state file at {}", path.display()); - s - } - Err(e) - if e.downcast_ref::() - .map(|e| e.kind() == std::io::ErrorKind::NotFound) - .unwrap_or(false) => - { - tracing::info!("Will create state file at {}", path.display()); - Self { - tenants: HashMap::new(), - path: path.to_owned(), - } - } - Err(e) => { - panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display()) - } - } - } -} - -/// State available to HTTP request handlers -#[derive(Clone)] -struct State { - inner: Arc>, -} - -impl State { - fn new(persistent_state: PersistentState) -> State { - Self { - inner: Arc::new(tokio::sync::RwLock::new(persistent_state)), - } - } -} - -#[inline(always)] -fn get_state(request: &Request) -> &State { - request - .data::>() - .expect("unknown state type") - .as_ref() -} - -/// Pageserver calls into this on startup, to learn which tenants it should attach -async fn handle_re_attach(mut req: Request) -> Result, ApiError> { - let reattach_req = json_request::(&mut req).await?; - - let state = 
get_state(&req).inner.clone(); - let mut locked = state.write().await; - - let mut response = ReAttachResponse { - tenants: Vec::new(), - }; - for (t, state) in &mut locked.tenants { - if state.pageserver == Some(reattach_req.node_id) { - state.generation += 1; - response.tenants.push(ReAttachResponseTenant { - // TODO(sharding): make this shard-aware - id: TenantShardId::unsharded(*t), - gen: state.generation, - }); - } - } - - locked.save().await.map_err(ApiError::InternalServerError)?; - - json_response(StatusCode::OK, response) -} - -/// Pageserver calls into this before doing deletions, to confirm that it still -/// holds the latest generation for the tenants with deletions enqueued -async fn handle_validate(mut req: Request) -> Result, ApiError> { - let validate_req = json_request::(&mut req).await?; - - let locked = get_state(&req).inner.read().await; - - let mut response = ValidateResponse { - tenants: Vec::new(), - }; - - for req_tenant in validate_req.tenants { - // TODO(sharding): make this shard-aware - if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) { - let valid = tenant_state.generation == req_tenant.gen; - tracing::info!( - "handle_validate: {}(gen {}): valid={valid} (latest {})", - req_tenant.id, - req_tenant.gen, - tenant_state.generation - ); - response.tenants.push(ValidateResponseTenant { - id: req_tenant.id, - valid, - }); - } - } - - json_response(StatusCode::OK, response) -} -/// Call into this before attaching a tenant to a pageserver, to acquire a generation number -/// (in the real control plane this is unnecessary, because the same program is managing -/// generation numbers and doing attachments). 
-async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { - let attach_req = json_request::(&mut req).await?; - - let state = get_state(&req).inner.clone(); - let mut locked = state.write().await; - - let tenant_state = locked - .tenants - .entry(attach_req.tenant_id) - .or_insert_with(|| TenantState { - pageserver: attach_req.node_id, - generation: 0, - }); - - if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { - tenant_state.generation += 1; - tracing::info!( - tenant_id = %attach_req.tenant_id, - ps_id = %attaching_pageserver, - generation = %tenant_state.generation, - "issuing", - ); - } else if let Some(ps_id) = tenant_state.pageserver { - tracing::info!( - tenant_id = %attach_req.tenant_id, - %ps_id, - generation = %tenant_state.generation, - "dropping", - ); - } else { - tracing::info!( - tenant_id = %attach_req.tenant_id, - "no-op: tenant already has no pageserver"); - } - tenant_state.pageserver = attach_req.node_id; - let generation = tenant_state.generation; - - tracing::info!( - "handle_attach_hook: tenant {} set generation {}, pageserver {}", - attach_req.tenant_id, - tenant_state.generation, - attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) - ); - - locked.save().await.map_err(ApiError::InternalServerError)?; - - json_response( - StatusCode::OK, - AttachHookResponse { - gen: attach_req.node_id.map(|_| generation), - }, - ) -} - -async fn handle_inspect(mut req: Request) -> Result, ApiError> { - let inspect_req = json_request::(&mut req).await?; - - let state = get_state(&req).inner.clone(); - let locked = state.write().await; - let tenant_state = locked.tenants.get(&inspect_req.tenant_id); - - json_response( - StatusCode::OK, - InspectResponse { - attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))), - }, - ) -} - -fn make_router(persistent_state: PersistentState) -> RouterBuilder { - endpoint::make_router() - .data(Arc::new(State::new(persistent_state))) - .post("/re-attach", |r| 
request_span(r, handle_re_attach)) - .post("/validate", |r| request_span(r, handle_validate)) - .post("/attach-hook", |r| request_span(r, handle_attach_hook)) - .post("/inspect", |r| request_span(r, handle_inspect)) -} - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - logging::init( - LogFormat::Plain, - logging::TracingErrorLayerEnablement::Disabled, - logging::Output::Stdout, - )?; - - let args = Cli::parse(); - tracing::info!( - "Starting, state at {}, listening on {}", - args.path.to_string_lossy(), - args.listen - ); - - let persistent_state = PersistentState::load_or_new(&args.path).await; - - let http_listener = tcp_listener::bind(args.listen)?; - let router = make_router(persistent_state) - .build() - .map_err(|err| anyhow!(err))?; - let service = utils::http::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)?.serve(service); - - tracing::info!("Serving on {0}", args.listen); - - tokio::task::spawn(server); - - ShutdownSignals::handle(|signal| match signal { - Signal::Interrupt | Signal::Terminate | Signal::Quit => { - tracing::info!("Got {}. Terminating", signal.name()); - // We're just a test helper: no graceful shutdown. - std::process::exit(0); - } - })?; - - Ok(()) -} diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 03e69010f7..8fe959792b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -6,21 +6,28 @@ //! rely on `neon_local` to set up the environment for each test. //! 
use anyhow::{anyhow, bail, Context, Result}; -use clap::{value_parser, Arg, ArgAction, ArgMatches, Command}; +use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum}; use compute_api::spec::ComputeMode; -use control_plane::attachment_service::AttachmentService; use control_plane::endpoint::ComputeControlPlane; -use control_plane::local_env::LocalEnv; -use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; +use control_plane::local_env::{ + InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, + SafekeeperConf, +}; +use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; -use control_plane::tenant_migration::migrate_tenant; +use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; -use pageserver_api::models::TimelineInfo; -use pageserver_api::{ +use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; +use pageserver_api::controller_api::PlacementPolicy; +use pageserver_api::models::{ + ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, +}; +use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; +use postgres_connection::parse_host_port; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, @@ -30,6 +37,7 @@ use std::path::PathBuf; use std::process::exit; use std::str::FromStr; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; +use url::Host; use utils::{ auth::{Claims, Scope}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -45,45 +53,7 @@ project_git_version!(GIT_VERSION); const DEFAULT_PG_VERSION: &str = "15"; -const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/"; - -fn default_conf(num_pageservers: u16) -> String { - let mut 
template = format!( - r#" -# Default built-in configuration, defined in main.rs -control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}' - -[broker] -listen_addr = '{DEFAULT_BROKER_ADDR}' - -[[safekeepers]] -id = {DEFAULT_SAFEKEEPER_ID} -pg_port = {DEFAULT_SAFEKEEPER_PG_PORT} -http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT} - -"#, - ); - - for i in 0..num_pageservers { - let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); - let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; - let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; - - template += &format!( - r#" -[[pageservers]] -id = {pageserver_id} -listen_pg_addr = '127.0.0.1:{pg_port}' -listen_http_addr = '127.0.0.1:{http_port}' -pg_auth_type = '{trust_auth}' -http_auth_type = '{trust_auth}' -"#, - trust_auth = AuthType::Trust, - ) - } - - template -} +const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; /// /// Timelines tree element used as a value in the HashMap. @@ -117,7 +87,8 @@ fn main() -> Result<()> { handle_init(sub_args).map(Some) } else { // all other commands need an existing config - let mut env = LocalEnv::load_config().context("Error loading config")?; + let mut env = + LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?; let original_env = env.clone(); let rt = tokio::runtime::Builder::new_current_thread() @@ -128,10 +99,10 @@ fn main() -> Result<()> { let subcommand_result = match sub_name { "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)), "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)), - "start" => rt.block_on(handle_start_all(sub_args, &env)), - "stop" => handle_stop_all(sub_args, &env), + "start" => rt.block_on(handle_start_all(&env)), + "stop" => rt.block_on(handle_stop_all(sub_args, &env)), "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)), - "attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)), + "storage_controller" => 
rt.block_on(handle_storage_controller(sub_args, &env)), "safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)), "endpoint" => rt.block_on(handle_endpoint(sub_args, &env)), "mappings" => handle_mappings(sub_args, &mut env), @@ -147,7 +118,7 @@ fn main() -> Result<()> { }; match subcommand_result { - Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(Some(updated_env)) => updated_env.persist_config()?, Ok(None) => (), Err(e) => { eprintln!("command failed: {e:?}"); @@ -276,10 +247,10 @@ fn print_timeline( /// Connects to the pageserver to query this information. async fn get_timeline_infos( env: &local_env::LocalEnv, - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, ) -> Result> { Ok(get_default_pageserver(env) - .timeline_list(tenant_id) + .timeline_list(tenant_shard_id) .await? .into_iter() .map(|timeline_info| (timeline_info.timeline_id, timeline_info)) @@ -297,6 +268,20 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R } } +// Helper function to parse --tenant_id option, for commands that accept a shard suffix +fn get_tenant_shard_id( + sub_match: &ArgMatches, + env: &local_env::LocalEnv, +) -> anyhow::Result { + if let Some(tenant_id_from_arguments) = parse_tenant_shard_id(sub_match).transpose() { + tenant_id_from_arguments + } else if let Some(default_id) = env.default_tenant_id { + Ok(TenantShardId::unsharded(default_id)) + } else { + anyhow::bail!("No tenant shard id. 
Use --tenant-id, or set a default tenant"); + } +} + fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .get_one::("tenant-id") @@ -305,6 +290,14 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { .context("Failed to parse tenant id from the argument string") } +fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .get_one::("tenant-id") + .map(|id_str| TenantShardId::from_str(id_str)) + .transpose() + .context("Failed to parse tenant shard id from the argument string") +} + fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { sub_match .get_one::("timeline-id") @@ -314,48 +307,66 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result anyhow::Result { - let num_pageservers = init_match - .get_one::("num-pageservers") - .expect("num-pageservers arg has a default"); - // Create config file - let toml_file: String = if let Some(config_path) = init_match.get_one::("config") { + let num_pageservers = init_match.get_one::("num-pageservers"); + + let force = init_match.get_one("force").expect("we set a default value"); + + // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`. + let init_conf: NeonLocalInitConf = if let Some(config_path) = + init_match.get_one::("config") + { + // User (likely the Python test suite) provided a description of the environment. + if num_pageservers.is_some() { + bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead"); + } // load and parse the file - std::fs::read_to_string(config_path).with_context(|| { + let contents = std::fs::read_to_string(config_path).with_context(|| { format!( "Could not read configuration file '{}'", config_path.display() ) - })? + })?; + toml_edit::de::from_str(&contents)? 
} else { - // Built-in default config - default_conf(*num_pageservers) + // User (likely interactive) did not provide a description of the environment, give them the default + NeonLocalInitConf { + control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())), + broker: NeonBroker { + listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(), + }, + safekeepers: vec![SafekeeperConf { + id: DEFAULT_SAFEKEEPER_ID, + pg_port: DEFAULT_SAFEKEEPER_PG_PORT, + http_port: DEFAULT_SAFEKEEPER_HTTP_PORT, + ..Default::default() + }], + pageservers: (0..num_pageservers.copied().unwrap_or(1)) + .map(|i| { + let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64); + let pg_port = DEFAULT_PAGESERVER_PG_PORT + i; + let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i; + NeonLocalInitPageserverConf { + id: pageserver_id, + listen_pg_addr: format!("127.0.0.1:{pg_port}"), + listen_http_addr: format!("127.0.0.1:{http_port}"), + pg_auth_type: AuthType::Trust, + http_auth_type: AuthType::Trust, + other: Default::default(), + } + }) + .collect(), + pg_distrib_dir: None, + neon_distrib_dir: None, + default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), + storage_controller: None, + control_plane_compute_hook_api: None, + } }; - let pg_version = init_match - .get_one::("pg-version") - .copied() - .context("Failed to parse postgres version from the argument string")?; - - let mut env = - LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?; - let force = init_match.get_flag("force"); - env.init(pg_version, force) - .context("Failed to initialize neon repository")?; - - // Create remote storage location for default LocalFs remote storage - std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - - // Initialize pageserver, create initial tenant and timeline. 
- for ps_conf in &env.pageservers { - PageServerNode::from_env(&env, ps_conf) - .initialize(&pageserver_config_overrides(init_match)) - .unwrap_or_else(|e| { - eprintln!("pageserver init failed: {e:?}"); - exit(1); - }); - } - - Ok(env) + LocalEnv::init(init_conf, force) + .context("materialize initial neon_local environment on disk")?; + Ok(LocalEnv::load_config(&local_env::base_path()) + .expect("freshly written config should be loadable")) } /// The default pageserver is the one where CLI tenant/timeline operations are sent by default. @@ -370,15 +381,6 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode { PageServerNode::from_env(env, ps_conf) } -fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { - init_match - .get_many::("pageserver-config-override") - .into_iter() - .flatten() - .map(String::as_str) - .collect() -} - async fn handle_tenant( tenant_match: &ArgMatches, env: &mut local_env::LocalEnv, @@ -390,50 +392,125 @@ async fn handle_tenant( println!("{} {:?}", t.id, t.state); } } + Some(("import", import_match)) => { + let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate); + + let storage_controller = StorageController::from_env(env); + let create_response = storage_controller.tenant_import(tenant_id).await?; + + let shard_zero = create_response + .shards + .first() + .expect("Import response omitted shards"); + + let attached_pageserver_id = shard_zero.node_id; + let pageserver = + PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?); + + println!( + "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}" + ); + + let timelines = pageserver + .http_client + .list_timelines(shard_zero.shard_id) + .await?; + + // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names + let main_timeline = timelines + .iter() + .find(|t| t.ancestor_timeline_id.is_none()) + .expect("No timelines found") + .timeline_id; + + let mut 
branch_i = 0; + for timeline in timelines.iter() { + let branch_name = if timeline.timeline_id == main_timeline { + "main".to_string() + } else { + branch_i += 1; + format!("branch_{branch_i}") + }; + + println!( + "Importing timeline {tenant_id}/{} as branch {branch_name}", + timeline.timeline_id + ); + + env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?; + } + } Some(("create", create_match)) => { let tenant_conf: HashMap<_, _> = create_match .get_many::("config") - .map(|vals| vals.flat_map(|c| c.split_once(':')).collect()) + .map(|vals: clap::parser::ValuesRef<'_, String>| { + vals.flat_map(|c| c.split_once(':')).collect() + }) .unwrap_or_default(); + let shard_count: u8 = create_match + .get_one::("shard-count") + .cloned() + .unwrap_or(0); + + let shard_stripe_size: Option = + create_match.get_one::("shard-stripe-size").cloned(); + + let placement_policy = match create_match.get_one::("placement-policy") { + Some(s) if !s.is_empty() => serde_json::from_str::(s)?, + _ => PlacementPolicy::Attached(0), + }; + + let tenant_conf = PageServerNode::parse_config(tenant_conf)?; + // If tenant ID was not specified, generate one let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate); - let generation = if env.control_plane_api.is_some() { - // We must register the tenant with the attachment service, so - // that when the pageserver restarts, it will be re-attached. - let attachment_service = AttachmentService::from_env(env); - attachment_service - .attach_hook(tenant_id, pageserver.conf.id) - .await? - } else { - None - }; - - pageserver - .tenant_create(tenant_id, generation, tenant_conf) + // We must register the tenant with the storage controller, so + // that when the pageserver restarts, it will be re-attached. 
+ let storage_controller = StorageController::from_env(env); + storage_controller + .tenant_create(TenantCreateRequest { + // Note that ::unsharded here isn't actually because the tenant is unsharded, it's because the + // storage controller expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest + // type is used both in storage controller (for creating tenants) and in pageserver (for creating shards) + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters { + count: ShardCount::new(shard_count), + stripe_size: shard_stripe_size + .map(ShardStripeSize) + .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), + }, + placement_policy: Some(placement_policy), + config: tenant_conf, + }) .await?; println!("tenant {tenant_id} successfully created on the pageserver"); // Create an initial timeline for the new tenant - let new_timeline_id = parse_timeline_id(create_match)?; + let new_timeline_id = + parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate()); let pg_version = create_match .get_one::("pg-version") .copied() .context("Failed to parse postgres version from the argument string")?; - let timeline_info = pageserver - .timeline_create( + // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have + // different shards picking different start lsns. Maybe we have to teach storage controller + // to let shard 0 branch first and then propagate the chosen LSN to other shards. 
+ storage_controller + .tenant_timeline_create( tenant_id, - new_timeline_id, - None, - None, - Some(pg_version), - None, + TimelineCreateRequest { + new_timeline_id, + ancestor_timeline_id: None, + ancestor_start_lsn: None, + existing_initdb_timeline_id: None, + pg_version: Some(pg_version), + }, ) .await?; - let new_timeline_id = timeline_info.timeline_id; - let last_record_lsn = timeline_info.last_record_lsn; env.register_branch_mapping( DEFAULT_BRANCH_NAME.to_string(), @@ -441,9 +518,7 @@ async fn handle_tenant( new_timeline_id, )?; - println!( - "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}", - ); + println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",); if create_match.get_flag("set-default") { println!("Setting tenant {tenant_id} as a default one"); @@ -469,14 +544,6 @@ async fn handle_tenant( .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; println!("tenant {tenant_id} successfully configured on the pageserver"); } - Some(("migrate", matches)) => { - let tenant_id = get_tenant_id(matches, env)?; - let new_pageserver = get_pageserver(env, matches)?; - let new_pageserver_id = new_pageserver.conf.id; - - migrate_tenant(env, tenant_id, new_pageserver).await?; - println!("tenant {tenant_id} migrated to {}", new_pageserver_id); - } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), @@ -489,8 +556,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local match timeline_match.subcommand() { Some(("list", list_match)) => { - let tenant_id = get_tenant_id(list_match, env)?; - let timelines = pageserver.timeline_list(&tenant_id).await?; + // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller + // where shard 0 is attached, and query there. 
+ let tenant_shard_id = get_tenant_shard_id(list_match, env)?; + let timelines = pageserver.timeline_list(&tenant_shard_id).await?; print_timelines_tree(timelines, env.timeline_name_mappings())?; } Some(("create", create_match)) => { @@ -505,18 +574,19 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .context("Failed to parse postgres version from the argument string")?; let new_timeline_id_opt = parse_timeline_id(create_match)?; + let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate()); - let timeline_info = pageserver - .timeline_create( - tenant_id, - new_timeline_id_opt, - None, - None, - Some(pg_version), - None, - ) + let storage_controller = StorageController::from_env(env); + let create_req = TimelineCreateRequest { + new_timeline_id, + ancestor_timeline_id: None, + existing_initdb_timeline_id: None, + ancestor_start_lsn: None, + pg_version: Some(pg_version), + }; + let timeline_info = storage_controller + .tenant_timeline_create(tenant_id, create_req) .await?; - let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; @@ -532,6 +602,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local let name = import_match .get_one::("node-name") .ok_or_else(|| anyhow!("No node name provided"))?; + let update_catalog = import_match + .get_one::("update-catalog") + .cloned() + .unwrap_or_default(); // Parse base inputs let base_tarfile = import_match @@ -574,7 +648,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local None, pg_version, ComputeMode::Primary, - DEFAULT_PAGESERVER_ID, + !update_catalog, )?; println!("Done"); } @@ -598,17 +672,18 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local .map(|lsn_str| Lsn::from_str(lsn_str)) .transpose() .context("Failed to parse ancestor start Lsn 
from the request")?; - let timeline_info = pageserver - .timeline_create( - tenant_id, - None, - start_lsn, - Some(ancestor_timeline_id), - None, - None, - ) + let new_timeline_id = TimelineId::generate(); + let storage_controller = StorageController::from_env(env); + let create_req = TimelineCreateRequest { + new_timeline_id, + ancestor_timeline_id: Some(ancestor_timeline_id), + existing_initdb_timeline_id: None, + ancestor_start_lsn: start_lsn, + pg_version: None, + }; + let timeline_info = storage_controller + .tenant_timeline_create(tenant_id, create_req) .await?; - let new_timeline_id = timeline_info.timeline_id; let last_record_lsn = timeline_info.last_record_lsn; @@ -635,8 +710,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re match sub_name { "list" => { - let tenant_id = get_tenant_id(sub_args, env)?; - let timeline_infos = get_timeline_infos(env, &tenant_id) + // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller + // where shard 0 is attached, and query there. 
+ let tenant_shard_id = get_tenant_shard_id(sub_args, env)?; + let timeline_infos = get_timeline_infos(env, &tenant_shard_id) .await .unwrap_or_else(|e| { eprintln!("Failed to load timeline info: {}", e); @@ -661,7 +738,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re for (endpoint_id, endpoint) in cplane .endpoints .iter() - .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id) + .filter(|(_, endpoint)| endpoint.tenant_id == tenant_shard_id.tenant_id) { let lsn_str = match endpoint.mode { ComputeMode::Static(lsn) => { @@ -680,7 +757,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re }; let branch_name = timeline_name_mappings - .get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id)) + .get(&TenantTimelineId::new( + tenant_shard_id.tenant_id, + endpoint.timeline_id, + )) .map(|name| name.as_str()) .unwrap_or("?"); @@ -690,7 +770,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), - endpoint.status(), + &format!("{}", endpoint.status()), ]); } @@ -706,6 +786,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get_one::("endpoint_id") .map(String::to_string) .unwrap_or_else(|| format!("ep-{branch_name}")); + let update_catalog = sub_args + .get_one::("update-catalog") + .cloned() + .unwrap_or_default(); let lsn = sub_args .get_one::("lsn") @@ -728,12 +812,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .copied() .unwrap_or(false); - let pageserver_id = - if let Some(id_str) = sub_args.get_one::("endpoint-pageserver-id") { - NodeId(id_str.parse().context("while parsing pageserver id")?) 
- } else { - DEFAULT_PAGESERVER_ID - }; + let allow_multiple = sub_args.get_flag("allow-multiple"); let mode = match (lsn, hot_standby) { (Some(lsn), false) => ComputeMode::Static(lsn), @@ -752,7 +831,9 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re _ => {} } - cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + if !allow_multiple { + cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?; + } cplane.new_endpoint( &endpoint_id, @@ -762,7 +843,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re http_port, pg_version, mode, - pageserver_id, + !update_catalog, )?; } "start" => { @@ -772,13 +853,17 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let pageserver_id = if let Some(id_str) = sub_args.get_one::("endpoint-pageserver-id") { - NodeId(id_str.parse().context("while parsing pageserver id")?) + Some(NodeId( + id_str.parse().context("while parsing pageserver id")?, + )) } else { - DEFAULT_PAGESERVER_ID + None }; let remote_ext_config = sub_args.get_one::("remote-ext-config"); + let allow_multiple = sub_args.get_flag("allow-multiple"); + // If --safekeepers argument is given, use only the listed safekeeper nodes. 
let safekeepers = if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { @@ -799,13 +884,51 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .get(endpoint_id.as_str()) .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?; - cplane.check_conflicting_endpoints( - endpoint.mode, - endpoint.tenant_id, - endpoint.timeline_id, - )?; + let create_test_user = sub_args + .get_one::("create-test-user") + .cloned() + .unwrap_or_default(); - let ps_conf = env.get_pageserver_conf(pageserver_id)?; + if !allow_multiple { + cplane.check_conflicting_endpoints( + endpoint.mode, + endpoint.tenant_id, + endpoint.timeline_id, + )?; + } + + let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id { + let conf = env.get_pageserver_conf(pageserver_id).unwrap(); + let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config"); + ( + vec![(parsed.0, parsed.1.unwrap_or(5432))], + // If caller is telling us what pageserver to use, this is not a tenant which is + // fully managed by storage controller, therefore not sharded. + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + } else { + // Look up the currently attached location of the tenant, and its striping metadata, + // to pass these on to postgres. 
+ let storage_controller = StorageController::from_env(env); + let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?; + let pageservers = locate_result + .shards + .into_iter() + .map(|shard| { + ( + Host::parse(&shard.listen_pg_addr) + .expect("Storage controller reported bad hostname"), + shard.listen_pg_port, + ) + }) + .collect::>(); + let stripe_size = locate_result.shard_params.stripe_size; + + (pageservers, stripe_size) + }; + assert!(!pageservers.is_empty()); + + let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?; let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) { let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant); @@ -816,7 +939,14 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re println!("Starting existing endpoint {endpoint_id}..."); endpoint - .start(&auth_token, safekeepers, remote_ext_config) + .start( + &auth_token, + safekeepers, + pageservers, + remote_ext_config, + stripe_size.0 as usize, + create_test_user, + ) .await?; } "reconfigure" => { @@ -827,27 +957,44 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - let pageserver_id = + let pageservers = if let Some(id_str) = sub_args.get_one::("endpoint-pageserver-id") { - Some(NodeId( - id_str.parse().context("while parsing pageserver id")?, - )) + let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?); + let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?); + vec![( + pageserver.pg_connection_config.host().clone(), + pageserver.pg_connection_config.port(), + )] } else { - None + let storage_controller = StorageController::from_env(env); + storage_controller + .tenant_locate(endpoint.tenant_id) + .await? 
+ .shards + .into_iter() + .map(|shard| { + ( + Host::parse(&shard.listen_pg_addr) + .expect("Storage controller reported malformed host"), + shard.listen_pg_port, + ) + }) + .collect::>() }; - endpoint.reconfigure(pageserver_id).await?; + endpoint.reconfigure(pageservers, None).await?; } "stop" => { let endpoint_id = sub_args .get_one::("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?; let destroy = sub_args.get_flag("destroy"); + let mode = sub_args.get_one::("mode").expect("has a default"); let endpoint = cplane .endpoints .get(endpoint_id.as_str()) .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?; - endpoint.stop(destroy)?; + endpoint.stop(mode, destroy)?; } _ => bail!("Unexpected endpoint subcommand '{sub_name}'"), @@ -904,10 +1051,7 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result Result<()> { match sub_match.subcommand() { Some(("start", subcommand_args)) => { - if let Err(e) = get_pageserver(env, subcommand_args)? - .start(&pageserver_config_overrides(subcommand_args)) - .await - { + if let Err(e) = get_pageserver(env, subcommand_args)?.start().await { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -933,27 +1077,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> exit(1); } - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { - eprintln!("pageserver start failed: {e}"); - exit(1); - } - } - - Some(("migrate", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - //TODO what shutdown strategy should we use here? 
- if let Err(e) = pageserver.stop(false) { - eprintln!("pageserver stop failed: {}", e); - exit(1); - } - - if let Err(e) = pageserver - .start(&pageserver_config_overrides(subcommand_args)) - .await - { + if let Err(e) = pageserver.start().await { eprintln!("pageserver start failed: {e}"); exit(1); } @@ -975,11 +1099,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_attachment_service( +async fn handle_storage_controller( sub_match: &ArgMatches, env: &local_env::LocalEnv, ) -> Result<()> { - let svc = AttachmentService::from_env(env); + let svc = StorageController::from_env(env); match sub_match.subcommand() { Some(("start", _start_match)) => { if let Err(e) = svc.start().await { @@ -994,13 +1118,13 @@ async fn handle_attachment_service( .map(|s| s.as_str()) == Some("immediate"); - if let Err(e) = svc.stop(immediate) { + if let Err(e) = svc.stop(immediate).await { eprintln!("stop failed: {}", e); exit(1); } } - Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name), - None => bail!("no attachment_service subcommand provided"), + Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name), + None => bail!("no storage_controller subcommand provided"), } Ok(()) } @@ -1080,29 +1204,26 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Ok(()) } -async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> { +async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> { // Endpoints are not started automatically broker::start_broker_process(env).await?; - // Only start the attachment service if the pageserver is configured to need it + // Only start the storage controller if the pageserver is configured to need it if env.control_plane_api.is_some() { - let attachment_service = AttachmentService::from_env(env); - if let Err(e) = attachment_service.start().await { - 
eprintln!("attachment_service start failed: {:#}", e); - try_stop_all(env, true); + let storage_controller = StorageController::from_env(env); + if let Err(e) = storage_controller.start().await { + eprintln!("storage_controller start failed: {:#}", e); + try_stop_all(env, true).await; exit(1); } } for ps_conf in &env.pageservers { let pageserver = PageServerNode::from_env(env, ps_conf); - if let Err(e) = pageserver - .start(&pageserver_config_overrides(sub_match)) - .await - { + if let Err(e) = pageserver.start().await { eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e); - try_stop_all(env, true); + try_stop_all(env, true).await; exit(1); } } @@ -1111,28 +1232,28 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start(vec![]).await { eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e); - try_stop_all(env, false); + try_stop_all(env, false).await; exit(1); } } Ok(()) } -fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +async fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let immediate = sub_match.get_one::("stop-mode").map(|s| s.as_str()) == Some("immediate"); - try_stop_all(env, immediate); + try_stop_all(env, immediate).await; Ok(()) } -fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { +async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { // Stop all endpoints match ComputeControlPlane::load(env.clone()) { Ok(cplane) => { for (_k, node) in cplane.endpoints { - if let Err(e) = node.stop(false) { + if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) { eprintln!("postgres stop failed: {e:#}"); } } @@ -1161,9 +1282,9 @@ fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { } if env.control_plane_api.is_some() { - let attachment_service = AttachmentService::from_env(env); - if let Err(e) = 
attachment_service.stop(immediate) { - eprintln!("attachment service stop failed: {e:#}"); + let storage_controller = StorageController::from_env(env); + if let Err(e) = storage_controller.stop(immediate).await { + eprintln!("storage controller stop failed: {e:#}"); } } } @@ -1241,13 +1362,6 @@ fn cli() -> Command { .required(false) .value_name("stop-mode"); - let pageserver_config_args = Arg::new("pageserver-config-override") - .long("pageserver-config-override") - .num_args(1) - .action(ArgAction::Append) - .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") - .required(false); - let remote_ext_config_args = Arg::new("remote-ext-config") .long("remote-ext-config") .num_args(1) @@ -1266,18 +1380,40 @@ fn cli() -> Command { .required(false); let force_arg = Arg::new("force") - .value_parser(value_parser!(bool)) + .value_parser(value_parser!(InitForceMode)) .long("force") - .action(ArgAction::SetTrue) + .default_value( + InitForceMode::MustNotExist + .to_possible_value() + .unwrap() + .get_name() + .to_owned(), + ) .help("Force initialization even if the repository is not empty") .required(false); let num_pageservers_arg = Arg::new("num-pageservers") .value_parser(value_parser!(u16)) .long("num-pageservers") - .help("How many pageservers to create (default 1)") - .required(false) - .default_value("1"); + .help("How many pageservers to create (default 1)"); + + let update_catalog = Arg::new("update-catalog") + .value_parser(value_parser!(bool)) + .long("update-catalog") + .help("If set, will set up the catalog for neon_superuser") + .required(false); + + let create_test_user = Arg::new("create-test-user") + .value_parser(value_parser!(bool)) + .long("create-test-user") + .help("If set, will create test user `user` and `neondb` database. 
Requires `update-catalog = true`") + .required(false); + + let allow_multiple = Arg::new("allow-multiple") + .help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.") + .long("allow-multiple") + .action(ArgAction::SetTrue) + .required(false); Command::new("Neon CLI") .arg_required_else_help(true) @@ -1285,14 +1421,13 @@ fn cli() -> Command { .subcommand( Command::new("init") .about("Initialize a new Neon repository, preparing configs for services to start with") - .arg(pageserver_config_args.clone()) .arg(num_pageservers_arg.clone()) .arg( Arg::new("config") .long("config") .required(false) .value_parser(value_parser!(PathBuf)) - .value_name("config"), + .value_name("config") ) .arg(pg_version_arg.clone()) .arg(force_arg) @@ -1300,6 +1435,7 @@ fn cli() -> Command { .subcommand( Command::new("timeline") .about("Manage timelines") + .arg_required_else_help(true) .subcommand(Command::new("list") .about("List all timelines, available to this pageserver") .arg(tenant_id_arg.clone())) @@ -1339,6 +1475,7 @@ fn cli() -> Command { .arg(Arg::new("end-lsn").long("end-lsn") .help("Lsn the basebackup ends at")) .arg(pg_version_arg.clone()) + .arg(update_catalog.clone()) ) ).subcommand( Command::new("tenant") @@ -1352,16 +1489,17 @@ fn cli() -> Command { .arg(pg_version_arg.clone()) .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false) .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified")) + .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)")) + .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages")) + .arg(Arg::new("placement-policy").value_parser(value_parser!(String)).long("placement-policy").action(ArgAction::Set).help("Placement 
policy shards in this tenant")) ) .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true)) .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified")) .subcommand(Command::new("config") .arg(tenant_id_arg.clone()) .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false))) - .subcommand(Command::new("migrate") - .about("Migrate a tenant from one pageserver to another") - .arg(tenant_id_arg.clone()) - .arg(pageserver_id_arg.clone())) + .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true)) + .about("Import a tenant that is present in remote storage, and create branches for its timelines")) ) .subcommand( Command::new("pageserver") @@ -1371,7 +1509,6 @@ fn cli() -> Command { .subcommand(Command::new("status")) .subcommand(Command::new("start") .about("Start local pageserver") - .arg(pageserver_config_args.clone()) ) .subcommand(Command::new("stop") .about("Stop local pageserver") @@ -1379,15 +1516,14 @@ fn cli() -> Command { ) .subcommand(Command::new("restart") .about("Restart local pageserver") - .arg(pageserver_config_args.clone()) ) ) .subcommand( - Command::new("attachment_service") + Command::new("storage_controller") .arg_required_else_help(true) - .about("Manage attachment_service") - .subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone())) - .subcommand(Command::new("stop").about("Stop local pageserver") + .about("Manage storage_controller") + .subcommand(Command::new("start").about("Start storage controller")) + .subcommand(Command::new("stop").about("Stop storage controller") .arg(stop_mode_arg.clone())) ) .subcommand( @@ -1432,6 +1568,8 @@ fn cli() -> Command { .required(false)) .arg(pg_version_arg.clone()) .arg(hot_standby_arg.clone()) + .arg(update_catalog) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("start") .about("Start postgres.\n If the endpoint doesn't exist 
yet, it is created.") @@ -1439,6 +1577,8 @@ fn cli() -> Command { .arg(endpoint_pageserver_id_arg.clone()) .arg(safekeepers_arg) .arg(remote_ext_config_args) + .arg(create_test_user) + .arg(allow_multiple.clone()) ) .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") @@ -1455,7 +1595,16 @@ fn cli() -> Command { .long("destroy") .action(ArgAction::SetTrue) .required(false) - ) + ) + .arg( + Arg::new("mode") + .help("Postgres shutdown mode, passed to \"pg_ctl -m \"") + .long("mode") + .action(ArgAction::Set) + .required(false) + .value_parser(["smart", "fast", "immediate"]) + .default_value("fast") + ) ) ) @@ -1481,7 +1630,6 @@ fn cli() -> Command { .subcommand( Command::new("start") .about("Start page server and safekeepers") - .arg(pageserver_config_args) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 3d5dfd6311..20371e1cb8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -12,7 +12,7 @@ //! //! The endpoint is managed by the `compute_ctl` binary. When an endpoint is //! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads -//! the basebackup from the pageserver to initialize the the data directory, and +//! the basebackup from the pageserver to initialize the data directory, and //! finally launches the PostgreSQL process. It watches the PostgreSQL process //! until it exits. //! 
@@ -41,22 +41,28 @@ use std::net::SocketAddr; use std::net::TcpStream; use std::path::PathBuf; use std::process::Command; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; +use compute_api::spec::Database; +use compute_api::spec::PgIdent; use compute_api::spec::RemoteExtSpec; +use compute_api::spec::Role; use nix::sys::signal::kill; use nix::sys::signal::Signal; +use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; +use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; use crate::local_env::LocalEnv; -use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; +use crate::storage_controller::StorageController; use compute_api::responses::{ComputeState, ComputeStatus}; -use compute_api::spec::{Cluster, ComputeMode, ComputeSpec}; +use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec}; // contents of a endpoint.json file #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] @@ -69,7 +75,7 @@ pub struct EndpointConf { http_port: u16, pg_version: u32, skip_pg_catalog_updates: bool, - pageserver_id: NodeId, + features: Vec, } // @@ -121,19 +127,15 @@ impl ComputeControlPlane { http_port: Option, pg_version: u32, mode: ComputeMode, - pageserver_id: NodeId, + skip_pg_catalog_updates: bool, ) -> Result> { let pg_port = pg_port.unwrap_or_else(|| self.get_port()); let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); - let pageserver = - PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?); - let ep = Arc::new(Endpoint { endpoint_id: endpoint_id.to_owned(), pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port), env: self.env.clone(), - pageserver, timeline_id, mode, tenant_id, @@ -144,7 +146,8 @@ impl ComputeControlPlane { // before and after start are the same. 
So, skip catalog updates, // with this we basically test a case of waking up an idle compute, where // we also skip catalog updates in the cloud. - skip_pg_catalog_updates: true, + skip_pg_catalog_updates, + features: vec![], }); ep.create_endpoint_dir()?; @@ -158,8 +161,8 @@ impl ComputeControlPlane { http_port, pg_port, pg_version, - skip_pg_catalog_updates: true, - pageserver_id, + skip_pg_catalog_updates, + features: vec![], })?, )?; std::fs::write( @@ -187,7 +190,7 @@ impl ComputeControlPlane { v.tenant_id == tenant_id && v.timeline_id == timeline_id && v.mode == mode - && v.status() != "stopped" + && v.status() != EndpointStatus::Stopped }); if let Some((key, _)) = duplicates.next() { @@ -218,10 +221,32 @@ pub struct Endpoint { // These are not part of the endpoint as such, but the environment // the endpoint runs in. pub env: LocalEnv, - pageserver: PageServerNode, // Optimizations skip_pg_catalog_updates: bool, + + // Feature flags + features: Vec, +} + +#[derive(PartialEq, Eq)] +pub enum EndpointStatus { + Running, + Stopped, + Crashed, + RunningNoPidfile, +} + +impl std::fmt::Display for EndpointStatus { + fn fmt(&self, writer: &mut std::fmt::Formatter) -> std::fmt::Result { + let s = match self { + Self::Running => "running", + Self::Stopped => "stopped", + Self::Crashed => "crashed", + Self::RunningNoPidfile => "running, no pidfile", + }; + write!(writer, "{}", s) + } } impl Endpoint { @@ -241,20 +266,17 @@ impl Endpoint { let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; - let pageserver = - PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?); - Ok(Endpoint { pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), endpoint_id, env: env.clone(), - pageserver, timeline_id: conf.timeline_id, mode: conf.mode, tenant_id: conf.tenant_id, pg_version: conf.pg_version, skip_pg_catalog_updates: 
conf.skip_pg_catalog_updates, + features: conf.features, }) } @@ -384,16 +406,16 @@ impl Endpoint { self.endpoint_path().join("pgdata") } - pub fn status(&self) -> &str { + pub fn status(&self) -> EndpointStatus { let timeout = Duration::from_millis(300); let has_pidfile = self.pgdata().join("postmaster.pid").exists(); let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok(); match (has_pidfile, can_connect) { - (true, true) => "running", - (false, false) => "stopped", - (true, false) => "crashed", - (false, true) => "running, no pidfile", + (true, true) => EndpointStatus::Running, + (false, false) => EndpointStatus::Stopped, + (true, false) => EndpointStatus::Crashed, + (false, true) => EndpointStatus::RunningNoPidfile, } } @@ -442,7 +464,7 @@ impl Endpoint { } fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { - // TODO use background_process::stop_process instead + // TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482 let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); @@ -469,13 +491,24 @@ impl Endpoint { } } + fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String { + pageservers + .iter() + .map(|(host, port)| format!("postgresql://no_user@{host}:{port}")) + .collect::>() + .join(",") + } + pub async fn start( &self, auth_token: &Option, safekeepers: Vec, + pageservers: Vec<(Host, u16)>, remote_ext_config: Option<&String>, + shard_stripe_size: usize, + create_test_user: bool, ) -> Result<()> { - if self.status() == "running" { + if self.status() == EndpointStatus::Running { anyhow::bail!("The endpoint is already running"); } @@ -487,13 +520,9 @@ impl Endpoint { std::fs::remove_dir_all(self.pgdata())?; } - let pageserver_connstring = { - let config = &self.pageserver.pg_connection_config; - let (host, port) = (config.host(), 
config.port()); + let pageserver_connstring = Self::build_pageserver_connstr(&pageservers); + assert!(!pageserver_connstring.is_empty()); - // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere. - format!("postgresql://no_user@{host}:{port}") - }; let mut safekeeper_connstrings = Vec::new(); if self.mode == ComputeMode::Primary { for sk_id in safekeepers { @@ -524,13 +553,32 @@ impl Endpoint { skip_pg_catalog_updates: self.skip_pg_catalog_updates, format_version: 1.0, operation_uuid: None, - features: vec![], + features: self.features.clone(), + swap_size_bytes: None, cluster: Cluster { cluster_id: None, // project ID: not used name: None, // project name: not used state: None, - roles: vec![], - databases: vec![], + roles: if create_test_user { + vec![Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }] + } else { + Vec::new() + }, + databases: if create_test_user { + vec![Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }] + } else { + Vec::new() + }, settings: None, postgresql_conf: Some(postgresql_conf), }, @@ -543,6 +591,8 @@ impl Endpoint { storage_auth_token: auth_token.clone(), remote_extensions, pgbouncer_settings: None, + shard_stripe_size: Some(shard_stripe_size), + primary_is_running: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -554,11 +604,16 @@ impl Endpoint { .open(self.endpoint_path().join("compute.log"))?; // Launch compute_ctl - println!("Starting postgres node at '{}'", self.connstr()); + let conn_str = self.connstr("cloud_admin", "postgres"); + println!("Starting postgres node at '{}'", conn_str); + if create_test_user { + let conn_str = self.connstr("test", "neondb"); + println!("Also at '{}'", conn_str); + } let mut cmd = 
Command::new(self.env.neon_distrib_dir.join("compute_ctl")); cmd.args(["--http-port", &self.http_address.port().to_string()]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) - .args(["--connstr", &self.connstr()]) + .args(["--connstr", &conn_str]) .args([ "--spec-path", self.endpoint_path().join("spec.json").to_str().unwrap(), @@ -580,9 +635,21 @@ impl Endpoint { } let child = cmd.spawn()?; + // set up a scopeguard to kill & wait for the child in case we panic or bail below + let child = scopeguard::guard(child, |mut child| { + println!("SIGKILL & wait the started process"); + (|| { + // TODO: use another signal that can be caught by the child so it can clean up any children it spawned + child.kill().context("SIGKILL child")?; + child.wait().context("wait() for child process")?; + anyhow::Ok(()) + })() + .with_context(|| format!("scopeguard kill&wait child {child:?}")) + .unwrap(); + }); // Write down the pid so we can wait for it when we want to stop - // TODO use background_process::start_process instead + // TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482 let pid = child.id(); let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); std::fs::write(pidfile_path, pid.to_string())?; @@ -590,7 +657,7 @@ impl Endpoint { // Wait for it to start let mut attempt = 0; const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); - const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s + const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min loop { attempt += 1; match self.get_status().await { @@ -617,7 +684,9 @@ impl Endpoint { } ComputeStatus::Empty | ComputeStatus::ConfigurationPending - | ComputeStatus::Configuration => { + | ComputeStatus::Configuration + | ComputeStatus::TerminationPending + | ComputeStatus::Terminated => { bail!("unexpected compute status: {:?}", state.status) } } @@ -631,6 +700,9 @@ impl Endpoint { std::thread::sleep(ATTEMPT_INTERVAL); } + // disarm the scopeguard, let the child 
outlive this function (and neon_local invoction) + drop(scopeguard::ScopeGuard::into_inner(child)); + Ok(()) } @@ -665,7 +737,11 @@ impl Endpoint { } } - pub async fn reconfigure(&self, pageserver_id: Option) -> Result<()> { + pub async fn reconfigure( + &self, + mut pageservers: Vec<(Host, u16)>, + stripe_size: Option, + ) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); let file = std::fs::File::open(spec_path)?; @@ -675,26 +751,34 @@ impl Endpoint { let postgresql_conf = self.read_postgresql_conf()?; spec.cluster.postgresql_conf = Some(postgresql_conf); - if let Some(pageserver_id) = pageserver_id { - let endpoint_config_path = self.endpoint_path().join("endpoint.json"); - let mut endpoint_conf: EndpointConf = { - let file = std::fs::File::open(&endpoint_config_path)?; - serde_json::from_reader(file)? - }; - endpoint_conf.pageserver_id = pageserver_id; - std::fs::write( - endpoint_config_path, - serde_json::to_string_pretty(&endpoint_conf)?, - )?; - - let pageserver = - PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?); - let ps_http_conf = &pageserver.pg_connection_config; - let (host, port) = (ps_http_conf.host(), ps_http_conf.port()); - spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}")); + // If we weren't given explicit pageservers, query the storage controller + if pageservers.is_empty() { + let storage_controller = StorageController::from_env(&self.env); + let locate_result = storage_controller.tenant_locate(self.tenant_id).await?; + pageservers = locate_result + .shards + .into_iter() + .map(|shard| { + ( + Host::parse(&shard.listen_pg_addr) + .expect("Storage controller reported bad hostname"), + shard.listen_pg_port, + ) + }) + .collect::>(); } - let client = reqwest::Client::new(); + let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); + assert!(!pageserver_connstr.is_empty()); + spec.pageserver_connstring = 
Some(pageserver_connstr); + if stripe_size.is_some() { + spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + } + + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(30)) + .build() + .unwrap(); let response = client .post(format!( "http://{}:{}/configure", @@ -721,22 +805,8 @@ impl Endpoint { } } - pub fn stop(&self, destroy: bool) -> Result<()> { - // If we are going to destroy data directory, - // use immediate shutdown mode, otherwise, - // shutdown gracefully to leave the data directory sane. - // - // Postgres is always started from scratch, so stop - // without destroy only used for testing and debugging. - // - self.pg_ctl( - if destroy { - &["-m", "immediate", "stop"] - } else { - &["stop"] - }, - &None, - )?; + pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> { + self.pg_ctl(&["-m", mode, "stop"], &None)?; // Also wait for the compute_ctl process to die. It might have some // cleanup work to do after postgres stops, like syncing safekeepers, @@ -757,13 +827,13 @@ impl Endpoint { Ok(()) } - pub fn connstr(&self) -> String { + pub fn connstr(&self, user: &str, db_name: &str) -> String { format!( "postgresql://{}@{}:{}/{}", - "cloud_admin", + user, self.pg_address.ip(), self.pg_address.port(), - "postgres" + db_name ) } } diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 52a0e20429..2af272f388 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -6,7 +6,6 @@ //! local installations. #![deny(clippy::undocumented_unsafe_blocks)] -pub mod attachment_service; mod background_process; pub mod broker; pub mod endpoint; @@ -14,4 +13,4 @@ pub mod local_env; pub mod pageserver; pub mod postgresql_conf; pub mod safekeeper; -pub mod tenant_migration; +pub mod storage_controller; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index b9c8aeddcb..6634274d2a 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,8 +3,9 @@ //! 
Now it also provides init method which acts like a stub for proper installation //! script which will use local paths. -use anyhow::{bail, ensure, Context}; +use anyhow::{bail, Context}; +use clap::ValueEnum; use postgres_backend::AuthType; use reqwest::Url; use serde::{Deserialize, Serialize}; @@ -16,11 +17,14 @@ use std::net::Ipv4Addr; use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; +use std::time::Duration; use utils::{ auth::{encode_from_key_file, Claims}, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, }; +use crate::pageserver::PageServerNode; +use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR; use crate::safekeeper::SafekeeperNode; pub const DEFAULT_PG_VERSION: u32 = 15; @@ -32,58 +36,107 @@ pub const DEFAULT_PG_VERSION: u32 = 15; // to 'neon_local init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[derive(PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute endpoints). // // This is not stored in the config file. Rather, this is the path where the - // config file itself is. It is read from the NEON_REPO_DIR env variable or - // '.neon' if not given. - #[serde(skip)] + // config file itself is. It is read from the NEON_REPO_DIR env variable which + // must be an absolute path. If the env var is not set, $PWD/.neon is used. pub base_data_dir: PathBuf, // Path to postgres distribution. It's expected that "bin", "include", // "lib", "share" from postgres distribution are there. If at some point // in time we will be able to run against vanilla postgres we may split that // to four separate paths and match OS-specific installation layout. - #[serde(default)] pub pg_distrib_dir: PathBuf, // Path to pageserver binary. 
- #[serde(default)] pub neon_distrib_dir: PathBuf, // Default tenant ID to use with the 'neon_local' command line utility, when // --tenant_id is not explicitly specified. - #[serde(default)] pub default_tenant_id: Option, // used to issue tokens during e.g pg start - #[serde(default)] pub private_key_path: PathBuf, pub broker: NeonBroker, + // Configuration for the storage controller (1 per neon_local environment) + pub storage_controller: NeonStorageControllerConf, + /// This Vec must always contain at least one pageserver + /// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s. + /// NB: not used anymore except for informing users that they need to change their `.neon/config`. pub pageservers: Vec, - #[serde(default)] pub safekeepers: Vec, - // Control plane location: if None, we will not run attachment_service. If set, this will + // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. - #[serde(default)] pub control_plane_api: Option, + // Control plane upcall API for storage controller. If set, this will be propagated into the + // storage controller's configuration. + pub control_plane_compute_hook_api: Option, + /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. - #[serde(default)] // A `HashMap>` would be more appropriate here, // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". + pub branch_name_mappings: HashMap>, +} + +/// On-disk state stored in `.neon/config`. 
+#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] +pub struct OnDiskConfig { + pub pg_distrib_dir: PathBuf, + pub neon_distrib_dir: PathBuf, + pub default_tenant_id: Option, + pub private_key_path: PathBuf, + pub broker: NeonBroker, + pub storage_controller: NeonStorageControllerConf, + #[serde( + skip_serializing, + deserialize_with = "fail_if_pageservers_field_specified" + )] + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option, + pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, } +fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + Err(serde::de::Error::custom( + "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \ + Please remove the `pageservers` from your .neon/config.", + )) +} + +/// The description of the neon_local env to be initialized by `neon_local init --config`. +#[derive(Clone, Debug, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct NeonLocalInitConf { + // TODO: do we need this? Seems unused + pub pg_distrib_dir: Option, + // TODO: do we need this? Seems unused + pub neon_distrib_dir: Option, + pub default_tenant_id: TenantId, + pub broker: NeonBroker, + pub storage_controller: Option, + pub pageservers: Vec, + pub safekeepers: Vec, + pub control_plane_api: Option>, + pub control_plane_compute_hook_api: Option>, +} + /// Broker config for cluster internal communication. #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] @@ -92,6 +145,33 @@ pub struct NeonBroker { pub listen_addr: SocketAddr, } +/// Broker config for cluster internal communication. 
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] +#[serde(default)] +pub struct NeonStorageControllerConf { + /// Heartbeat timeout before marking a node offline + #[serde(with = "humantime_serde")] + pub max_unavailable: Duration, + + /// Threshold for auto-splitting a tenant into shards + pub split_threshold: Option, +} + +impl NeonStorageControllerConf { + // Use a shorter pageserver unavailability interval than the default to speed up tests. + const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = + std::time::Duration::from_secs(10); +} + +impl Default for NeonStorageControllerConf { + fn default() -> Self { + Self { + max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL, + split_threshold: None, + } + } +} + // Dummy Default impl to satisfy Deserialize derive. impl Default for NeonBroker { fn default() -> Self { @@ -107,17 +187,16 @@ impl NeonBroker { } } +// neon_local needs to know this subset of pageserver configuration. +// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`. +// It can get stale if `pageserver.toml` is changed. +// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml` #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] -#[serde(default)] +#[serde(default, deny_unknown_fields)] pub struct PageServerConf { - // node id pub id: NodeId, - - // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, - - // auth type used for the PG and HTTP ports pub pg_auth_type: AuthType, pub http_auth_type: AuthType, } @@ -134,6 +213,40 @@ impl Default for PageServerConf { } } +/// The toml that can be passed to `neon_local init --config`. +/// This is a subset of the `pageserver.toml` configuration. 
+// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +pub struct NeonLocalInitPageserverConf { + pub id: NodeId, + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub pg_auth_type: AuthType, + pub http_auth_type: AuthType, + #[serde(flatten)] + pub other: HashMap, +} + +impl From<&NeonLocalInitPageserverConf> for PageServerConf { + fn from(conf: &NeonLocalInitPageserverConf) -> Self { + let NeonLocalInitPageserverConf { + id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + other: _, + } = conf; + Self { + id: *id, + listen_pg_addr: listen_pg_addr.clone(), + listen_http_addr: listen_http_addr.clone(), + pg_auth_type: *pg_auth_type, + http_auth_type: *http_auth_type, + } + } +} + #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { @@ -145,6 +258,7 @@ pub struct SafekeeperConf { pub remote_storage: Option, pub backup_threads: Option, pub auth_enabled: bool, + pub listen_addr: Option, } impl Default for SafekeeperConf { @@ -158,10 +272,36 @@ impl Default for SafekeeperConf { remote_storage: None, backup_threads: None, auth_enabled: false, + listen_addr: None, } } } +#[derive(Clone, Copy)] +pub enum InitForceMode { + MustNotExist, + EmptyDirOk, + RemoveAllContents, +} + +impl ValueEnum for InitForceMode { + fn value_variants<'a>() -> &'a [Self] { + &[ + Self::MustNotExist, + Self::EmptyDirOk, + Self::RemoveAllContents, + ] + } + + fn to_possible_value(&self) -> Option { + Some(clap::builder::PossibleValue::new(match self { + InitForceMode::MustNotExist => "must-not-exist", + InitForceMode::EmptyDirOk => "empty-dir-ok", + InitForceMode::RemoveAllContents => "remove-all-contents", + })) + } +} + impl SafekeeperConf { /// Compute is served by port on which only tenant scoped tokens allowed, if /// it is configured. 
@@ -196,8 +336,12 @@ impl LocalEnv { self.neon_distrib_dir.join("pageserver") } - pub fn attachment_service_bin(&self) -> PathBuf { - self.neon_distrib_dir.join("attachment_service") + pub fn storage_controller_bin(&self) -> PathBuf { + // Irrespective of configuration, storage controller binary is always + // run from the same location as neon_local. This means that for compatibility + // tests that run old pageserver/safekeeper, they still run latest storage controller. + let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned(); + neon_local_bin_dir.join("storage_controller") } pub fn safekeeper_bin(&self) -> PathBuf { @@ -225,7 +369,13 @@ impl LocalEnv { if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) { Ok(conf) } else { - bail!("could not find pageserver {id}") + let have_ids = self + .pageservers + .iter() + .map(|node| format!("{}:{}", node.id, node.listen_http_addr)) + .collect::>(); + let joined = have_ids.join(","); + bail!("could not find pageserver {id}, have ids {joined}") } } @@ -280,44 +430,8 @@ impl LocalEnv { .collect() } - /// Create a LocalEnv from a config file. - /// - /// Unlike 'load_config', this function fills in any defaults that are missing - /// from the config file. - pub fn parse_config(toml: &str) -> anyhow::Result { - let mut env: LocalEnv = toml::from_str(toml)?; - - // Find postgres binaries. - // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". - // Note that later in the code we assume, that distrib dirs follow the same pattern - // for all postgres versions. - if env.pg_distrib_dir == Path::new("") { - if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { - env.pg_distrib_dir = postgres_bin.into(); - } else { - let cwd = env::current_dir()?; - env.pg_distrib_dir = cwd.join("pg_install") - } - } - - // Find neon binaries. 
- if env.neon_distrib_dir == Path::new("") { - env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned(); - } - - if env.pageservers.is_empty() { - anyhow::bail!("Configuration must contain at least one pageserver"); - } - - env.base_data_dir = base_path(); - - Ok(env) - } - - /// Locate and load config - pub fn load_config() -> anyhow::Result { - let repopath = base_path(); - + /// Construct `Self` from on-disk state. + pub fn load_config(repopath: &Path) -> anyhow::Result { if !repopath.exists() { bail!( "Neon config is not found in {}. You need to run 'neon_local init' first", @@ -328,38 +442,129 @@ impl LocalEnv { // TODO: check that it looks like a neon repository // load and parse file - let config = fs::read_to_string(repopath.join("config"))?; - let mut env: LocalEnv = toml::from_str(config.as_str())?; + let config_file_contents = fs::read_to_string(repopath.join("config"))?; + let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?; + let mut env = { + let OnDiskConfig { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } = on_disk_config; + LocalEnv { + base_data_dir: repopath.to_owned(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + private_key_path, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + branch_name_mappings, + } + }; - env.base_data_dir = repopath; + // The source of truth for pageserver configuration is the pageserver.toml. 
+ assert!( + env.pageservers.is_empty(), + "we ensure this during deserialization" + ); + env.pageservers = { + let iter = std::fs::read_dir(repopath).context("open dir")?; + let mut pageservers = Vec::new(); + for res in iter { + let dentry = res?; + const PREFIX: &str = "pageserver_"; + let dentry_name = dentry + .file_name() + .into_string() + .ok() + .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path())) + .unwrap(); + if !dentry_name.starts_with(PREFIX) { + continue; + } + if !dentry.file_type().context("determine file type")?.is_dir() { + anyhow::bail!("expected a directory, got {:?}", dentry.path()); + } + let id = dentry_name[PREFIX.len()..] + .parse::() + .with_context(|| format!("parse id from {:?}", dentry.path()))?; + // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656) + #[derive(serde::Serialize, serde::Deserialize)] + // (allow unknown fields, unlike PageServerConf) + struct PageserverConfigTomlSubset { + id: NodeId, + listen_pg_addr: String, + listen_http_addr: String, + pg_auth_type: AuthType, + http_auth_type: AuthType, + } + let config_toml_path = dentry.path().join("pageserver.toml"); + let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( + &std::fs::read_to_string(&config_toml_path) + .with_context(|| format!("read {:?}", config_toml_path))?, + ) + .context("parse pageserver.toml")?; + let PageserverConfigTomlSubset { + id: config_toml_id, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + } = config_toml; + let conf = PageServerConf { + id: { + anyhow::ensure!( + config_toml_id == id, + "id mismatch: config_toml.id={config_toml_id} id={id}", + ); + id + }, + listen_pg_addr, + listen_http_addr, + pg_auth_type, + http_auth_type, + }; + pageservers.push(conf); + } + pageservers + }; Ok(env) } - pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { - // Currently, the user first passes a config file with 'neon_local init --config=' - // We read that in, in 
`create_config`, and fill any missing defaults. Then it's saved - // to .neon/config. TODO: We lose any formatting and comments along the way, which is - // a bit sad. - let mut conf_content = r#"# This file describes a local deployment of the page server -# and safekeeeper node. It is read by the 'neon_local' command-line -# utility. -"# - .to_string(); - - // Convert the LocalEnv to a toml file. - // - // This could be as simple as this: - // - // conf_content += &toml::to_string_pretty(env)?; - // - // But it results in a "values must be emitted before tables". I'm not sure - // why, AFAICS the table, i.e. 'safekeepers: Vec' is last. - // Maybe rust reorders the fields to squeeze avoid padding or something? - // In any case, converting to toml::Value first, and serializing that, works. - // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?; + pub fn persist_config(&self) -> anyhow::Result<()> { + Self::persist_config_impl( + &self.base_data_dir, + &OnDiskConfig { + pg_distrib_dir: self.pg_distrib_dir.clone(), + neon_distrib_dir: self.neon_distrib_dir.clone(), + default_tenant_id: self.default_tenant_id, + private_key_path: self.private_key_path.clone(), + broker: self.broker.clone(), + storage_controller: self.storage_controller.clone(), + pageservers: vec![], // it's skip_serializing anyway + safekeepers: self.safekeepers.clone(), + control_plane_api: self.control_plane_api.clone(), + control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + branch_name_mappings: self.branch_name_mappings.clone(), + }, + ) + } + pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> { + let conf_content = &toml::to_string_pretty(config)?; let target_config_path = base_path.join("config"); fs::write(&target_config_path, conf_content).with_context(|| { format!( @@ -371,117 +576,166 @@ impl LocalEnv { // this function is used only for testing 
purposes in CLI e g generate tokens during init pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result { - let private_key_path = if self.private_key_path.is_absolute() { - self.private_key_path.to_path_buf() - } else { - self.base_data_dir.join(&self.private_key_path) - }; - + let private_key_path = self.get_private_key_path(); let key_data = fs::read(private_key_path)?; encode_from_key_file(claims, &key_data) } - // - // Initialize a new Neon repository - // - pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> { - // check if config already exists - let base_path = &self.base_data_dir; - ensure!( - base_path != Path::new(""), - "repository base path is missing" - ); + pub fn get_private_key_path(&self) -> PathBuf { + if self.private_key_path.is_absolute() { + self.private_key_path.to_path_buf() + } else { + self.base_data_dir.join(&self.private_key_path) + } + } + /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`]. + pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> { + let base_path = base_path(); + assert_ne!(base_path, Path::new("")); + let base_path = &base_path; + + // create base_path dir if base_path.exists() { - if force { - println!("removing all contents of '{}'", base_path.display()); - // instead of directly calling `remove_dir_all`, we keep the original dir but removing - // all contents inside. This helps if the developer symbol links another directory (i.e., - // S3 local SSD) to the `.neon` base directory. - for entry in std::fs::read_dir(base_path)? { - let entry = entry?; - let path = entry.path(); - if path.is_dir() { - fs::remove_dir_all(&path)?; - } else { - fs::remove_file(&path)?; + match force { + InitForceMode::MustNotExist => { + bail!( + "directory '{}' already exists. 
Perhaps already initialized?", + base_path.display() + ); + } + InitForceMode::EmptyDirOk => { + if let Some(res) = std::fs::read_dir(base_path)?.next() { + res.context("check if directory is empty")?; + anyhow::bail!("directory not empty: {base_path:?}"); + } + } + InitForceMode::RemoveAllContents => { + println!("removing all contents of '{}'", base_path.display()); + // instead of directly calling `remove_dir_all`, we keep the original dir but removing + // all contents inside. This helps if the developer symbol links another directory (i.e., + // S3 local SSD) to the `.neon` base directory. + for entry in std::fs::read_dir(base_path)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + fs::remove_dir_all(&path)?; + } else { + fs::remove_file(&path)?; + } } } - } else { - bail!( - "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)", - base_path.display() - ); } } - - if !self.pg_bin_dir(pg_version)?.join("postgres").exists() { - bail!( - "Can't find postgres binary at {}", - self.pg_bin_dir(pg_version)?.display() - ); - } - for binary in ["pageserver", "safekeeper"] { - if !self.neon_distrib_dir.join(binary).exists() { - bail!( - "Can't find binary '{binary}' in neon distrib dir '{}'", - self.neon_distrib_dir.display() - ); - } - } - if !base_path.exists() { fs::create_dir(base_path)?; } + let NeonLocalInitConf { + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id, + broker, + storage_controller, + pageservers, + safekeepers, + control_plane_api, + control_plane_compute_hook_api, + } = conf; + + // Find postgres binaries. + // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install". + // Note that later in the code we assume, that distrib dirs follow the same pattern + // for all postgres versions. 
+ let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| { + if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") { + postgres_bin.into() + } else { + let cwd = env::current_dir().unwrap(); + cwd.join("pg_install") + } + }); + + // Find neon binaries. + let neon_distrib_dir = neon_distrib_dir + .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned()); + // Generate keypair for JWT. // // The keypair is only needed if authentication is enabled in any of the // components. For convenience, we generate the keypair even if authentication // is not enabled, so that you can easily enable it after the initialization - // step. However, if the key generation fails, we treat it as non-fatal if - // authentication was not enabled. - if self.private_key_path == PathBuf::new() { - match generate_auth_keys( - base_path.join("auth_private_key.pem").as_path(), - base_path.join("auth_public_key.pem").as_path(), - ) { - Ok(()) => { - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } - Err(e) => { - if !self.auth_keys_needed() { - eprintln!("Could not generate keypair for JWT authentication: {e}"); - eprintln!("Continuing anyway because authentication was not enabled"); - self.private_key_path = PathBuf::from("auth_private_key.pem"); - } else { - return Err(e); - } - } - } + // step. 
+ generate_auth_keys( + base_path.join("auth_private_key.pem").as_path(), + base_path.join("auth_public_key.pem").as_path(), + ) + .context("generate auth keys")?; + let private_key_path = PathBuf::from("auth_private_key.pem"); + + // create the runtime type because the remaining initialization code below needs + // a LocalEnv instance op operation + // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state + let env = LocalEnv { + base_data_dir: base_path.clone(), + pg_distrib_dir, + neon_distrib_dir, + default_tenant_id: Some(default_tenant_id), + private_key_path, + broker, + storage_controller: storage_controller.unwrap_or_default(), + pageservers: pageservers.iter().map(Into::into).collect(), + safekeepers, + control_plane_api: control_plane_api.unwrap_or_default(), + control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + branch_name_mappings: Default::default(), + }; + + // create endpoints dir + fs::create_dir_all(env.endpoints_path())?; + + // create safekeeper dirs + for safekeeper in &env.safekeepers { + fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?; } - fs::create_dir_all(self.endpoints_path())?; - - for safekeeper in &self.safekeepers { - fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?; + // initialize pageserver state + for (i, ps) in pageservers.into_iter().enumerate() { + let runtime_ps = &env.pageservers[i]; + assert_eq!(&PageServerConf::from(&ps), runtime_ps); + fs::create_dir(env.pageserver_data_dir(ps.id))?; + PageServerNode::from_env(&env, runtime_ps) + .initialize(ps) + .context("pageserver init failed")?; } - self.persist_config(base_path) - } + // setup remote remote location for default LocalFs remote storage + std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; - fn auth_keys_needed(&self) -> bool { - self.pageservers.iter().any(|ps| { - ps.pg_auth_type == AuthType::NeonJWT || 
ps.http_auth_type == AuthType::NeonJWT - }) || self.safekeepers.iter().any(|sk| sk.auth_enabled) + env.persist_config() } } -fn base_path() -> PathBuf { - match std::env::var_os("NEON_REPO_DIR") { - Some(val) => PathBuf::from(val), - None => PathBuf::from(".neon"), - } +pub fn base_path() -> PathBuf { + let path = match std::env::var_os("NEON_REPO_DIR") { + Some(val) => { + let path = PathBuf::from(val); + if !path.is_absolute() { + // repeat the env var in the error because our default is always absolute + panic!("NEON_REPO_DIR must be an absolute path, got {path:?}"); + } + path + } + None => { + let pwd = std::env::current_dir() + // technically this can fail but it's quite unlikeley + .expect("determine current directory"); + let pwd_abs = pwd.canonicalize().expect("canonicalize current directory"); + pwd_abs.join(".neon") + } + }; + assert!(path.is_absolute()); + path } /// Generate a public/private key pair for JWT authentication @@ -520,31 +774,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow } Ok(()) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn simple_conf_parsing() { - let simple_conf_toml = include_str!("../simple.conf"); - let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml); - assert!( - simple_conf_parse_result.is_ok(), - "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}" - ); - - let string_to_replace = "listen_addr = '127.0.0.1:50051'"; - let spoiled_url_str = "listen_addr = '!@$XOXO%^&'"; - let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str); - assert!( - spoiled_url_toml.contains(spoiled_url_str), - "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}" - ); - let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml); - assert!( - spoiled_url_parse_result.is_err(), - "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got 
{spoiled_url_parse_result:?}" - ); - } -} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index fb0d251722..13e684da24 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -4,20 +4,22 @@ //! //! .neon/ //! -use std::borrow::Cow; use std::collections::HashMap; use std::io; use std::io::Write; use std::num::NonZeroU64; use std::path::PathBuf; -use std::process::{Child, Command}; +use std::str::FromStr; use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; use futures::SinkExt; -use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo}; +use pageserver_api::models::{ + self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, + TimelineInfo, +}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; @@ -28,7 +30,7 @@ use utils::{ lsn::Lsn, }; -use crate::local_env::PageServerConf; +use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf}; use crate::{background_process, local_env::LocalEnv}; /// Directory within .neon which will be used by default for LocalFs remote storage. @@ -72,68 +74,75 @@ impl PageServerNode { } } - /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration. - /// - /// These all end up on the command line of the `pageserver` binary. 
- fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec { - let id = format!("id={}", self.conf.id); + fn pageserver_init_make_toml( + &self, + conf: NeonLocalInitPageserverConf, + ) -> anyhow::Result { + assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully"); + + // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656) + // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let pg_distrib_dir_param = format!( "pg_distrib_dir='{}'", self.env.pg_distrib_dir_raw().display() ); - let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type); - let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr); - - let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type); - let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr); - let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url()); - let mut overrides = vec![ - id, - pg_distrib_dir_param, - http_auth_type_param, - pg_auth_type_param, - listen_http_addr_param, - listen_pg_addr_param, - broker_endpoint_param, - ]; + let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param]; if let Some(control_plane_api) = &self.env.control_plane_api { overrides.push(format!( "control_plane_api='{}'", control_plane_api.as_str() )); + + // Storage controller uses the same auth as pageserver: if JWT is enabled + // for us, we will also need it to talk to them. 
+ if matches!(conf.http_auth_type, AuthType::NeonJWT) { + let jwt_token = self + .env + .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) + .unwrap(); + overrides.push(format!("control_plane_api_token='{}'", jwt_token)); + } } - if !cli_overrides - .iter() - .any(|c| c.starts_with("remote_storage")) - { + if !conf.other.contains_key("remote_storage") { overrides.push(format!( "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}" )); } - if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust - { + if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust { // Keys are generated in the toplevel repo dir, pageservers' workdirs // are one level below that, so refer to keys with ../ overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned()); } // Apply the user-provided overrides - overrides.extend(cli_overrides.iter().map(|&c| c.to_owned())); + overrides.push( + toml_edit::ser::to_string_pretty(&conf) + .expect("we deserialized this from toml earlier"), + ); - overrides + // Turn `overrides` into a toml document. + // TODO: above code is legacy code, it should be refactored to use toml_edit directly. + let mut config_toml = toml_edit::Document::new(); + for fragment_str in overrides { + let fragment = toml_edit::Document::from_str(&fragment_str) + .expect("all fragments in `overrides` are valid toml documents, this function controls that"); + for (key, item) in fragment.iter() { + config_toml.insert(key, item.clone()); + } + } + Ok(config_toml) } /// Initializes a pageserver node by creating its config with the overrides provided. - pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> { - // First, run `pageserver --init` and wait for it to write a config into FS and exit. 
- self.pageserver_init(config_overrides) + pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { + self.pageserver_init(conf) .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id)) } @@ -149,11 +158,11 @@ impl PageServerNode { .expect("non-Unicode path") } - pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result { - self.start_node(config_overrides, false).await + pub async fn start(&self) -> anyhow::Result<()> { + self.start_node().await } - fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> { + fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> { let datadir = self.repo_path(); let node_id = self.conf.id; println!( @@ -164,38 +173,48 @@ impl PageServerNode { ); io::stdout().flush()?; - if !datadir.exists() { - std::fs::create_dir(&datadir)?; - } + let config = self + .pageserver_init_make_toml(conf) + .context("make pageserver toml")?; + let config_file_path = datadir.join("pageserver.toml"); + let mut config_file = std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&config_file_path) + .with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?; + config_file + .write_all(config.to_string().as_bytes()) + .context("write pageserver toml")?; + drop(config_file); + // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config - let datadir_path_str = datadir.to_str().with_context(|| { - format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}") - })?; - let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); - args.push(Cow::Borrowed("--init")); + // Write metadata file, used by pageserver on startup to register itself with + // the storage controller + let metadata_path = datadir.join("metadata.json"); - let init_output = Command::new(self.env.pageserver_bin()) - 
.args(args.iter().map(Cow::as_ref)) - .envs(self.pageserver_env_variables()?) - .output() - .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?; - - anyhow::ensure!( - init_output.status.success(), - "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}", - node_id, - String::from_utf8_lossy(&init_output.stdout), - String::from_utf8_lossy(&init_output.stderr), - ); + let (_http_host, http_port) = + parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); + let http_port = http_port.unwrap_or(9898); + // Intentionally hand-craft JSON: this acts as an implicit format compat test + // in case the pageserver-side structure is edited, and reflects the real life + // situation: the metadata is written by some other script. + std::fs::write( + metadata_path, + serde_json::to_vec(&pageserver_api::config::NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: self.pg_connection_config.port(), + http_host: "localhost".to_string(), + http_port, + other: HashMap::new(), + }) + .unwrap(), + ) + .expect("Failed to write metadata file"); Ok(()) } - async fn start_node( - &self, - config_overrides: &[&str], - update_config: bool, - ) -> anyhow::Result { + async fn start_node(&self) -> anyhow::Result<()> { // TODO: using a thread here because start_process() is not async but we need to call check_status() let datadir = self.repo_path(); print!( @@ -212,15 +231,12 @@ impl PageServerNode { self.conf.id, datadir, ) })?; - let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str); - if update_config { - args.push(Cow::Borrowed("--update-config")); - } + let args = vec!["-D", datadir_path_str]; background_process::start_process( "pageserver", &datadir, &self.env.pageserver_bin(), - args.iter().map(Cow::as_ref), + args, self.pageserver_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), || async { @@ -232,23 +248,9 @@ impl PageServerNode 
{ } }, ) - .await - } + .await?; - fn pageserver_basic_args<'a>( - &self, - config_overrides: &'a [&'a str], - datadir_path_str: &'a str, - ) -> Vec> { - let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)]; - - let overrides = self.neon_local_overrides(config_overrides); - for config_override in overrides { - args.push(Cow::Borrowed("-c")); - args.push(Cow::Owned(config_override)); - } - - args + Ok(()) } fn pageserver_env_variables(&self) -> anyhow::Result> { @@ -301,16 +303,8 @@ impl PageServerNode { pub async fn tenant_list(&self) -> mgmt_api::Result> { self.http_client.list_tenants().await } - - pub async fn tenant_create( - &self, - new_tenant_id: TenantId, - generation: Option, - settings: HashMap<&str, &str>, - ) -> anyhow::Result { - let mut settings = settings.clone(); - - let config = models::TenantConfig { + pub fn parse_config(mut settings: HashMap<&str, &str>) -> anyhow::Result { + let result = models::TenantConfig { checkpoint_distance: settings .remove("checkpoint_distance") .map(|x| x.parse::()) @@ -325,6 +319,11 @@ impl PageServerNode { .remove("compaction_threshold") .map(|x| x.parse::()) .transpose()?, + compaction_algorithm: settings + .remove("compaction_algorithm") + .map(serde_json::from_str) + .transpose() + .context("Failed to parse 'compaction_algorithm' json")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) @@ -334,6 +333,10 @@ impl PageServerNode { .remove("image_creation_threshold") .map(|x| x.parse::()) .transpose()?, + image_layer_creation_check_threshold: settings + .remove("image_layer_creation_check_threshold") + .map(|x| x.parse::()) + .transpose()?, pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") @@ -364,18 +367,49 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), - 
gc_feedback: settings - .remove("gc_feedback") + heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + lazy_slru_download: settings + .remove("lazy_slru_download") .map(|x| x.parse::()) .transpose() - .context("Failed to parse 'gc_feedback' as bool")?, - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .context("Failed to parse 'lazy_slru_download' as bool")?, + timeline_get_throttle: settings + .remove("timeline_get_throttle") + .map(serde_json::from_str) + .transpose() + .context("parse `timeline_get_throttle` from json")?, + switch_aux_file_policy: settings + .remove("switch_aux_file_policy") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_aux_file_policy'")?, + lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length_for_ts: settings + .remove("lsn_lease_length_for_ts") + .map(|x| x.to_string()), }; + if !settings.is_empty() { + bail!("Unrecognized tenant settings: {settings:?}") + } else { + Ok(result) + } + } + + pub async fn tenant_create( + &self, + new_tenant_id: TenantId, + generation: Option, + settings: HashMap<&str, &str>, + ) -> anyhow::Result { + let config = Self::parse_config(settings.clone())?; let request = models::TenantCreateRequest { new_tenant_id: TenantShardId::unsharded(new_tenant_id), generation, config, + shard_parameters: ShardParameters::default(), + // Placement policy is not meaningful for creations not done via storage controller + placement_policy: None, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -408,6 +442,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'compaction_threshold' as an integer")?, + compaction_algorithm: settings + .remove("compactin_algorithm") + .map(serde_json::from_str) + .transpose() + .context("Failed to parse 'compaction_algorithm' json")?, gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) @@ -419,6 
+458,12 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_threshold' as non zero integer")?, + image_layer_creation_check_threshold: settings + .remove("image_layer_creation_check_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_check_threshold' as integer")?, + pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") @@ -449,12 +494,26 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), - gc_feedback: settings - .remove("gc_feedback") + heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + lazy_slru_download: settings + .remove("lazy_slru_download") .map(|x| x.parse::()) .transpose() - .context("Failed to parse 'gc_feedback' as bool")?, - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), + .context("Failed to parse 'lazy_slru_download' as bool")?, + timeline_get_throttle: settings + .remove("timeline_get_throttle") + .map(serde_json::from_str) + .transpose() + .context("parse `timeline_get_throttle` from json")?, + switch_aux_file_policy: settings + .remove("switch_aux_file_policy") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'switch_aux_file_policy'")?, + lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), + lsn_lease_length_for_ts: settings + .remove("lsn_lease_length_for_ts") + .map(|x| x.to_string()), } }; @@ -471,38 +530,33 @@ impl PageServerNode { pub async fn location_config( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> anyhow::Result<()> { Ok(self .http_client - .location_config(tenant_id, config, flush_ms) + .location_config(tenant_shard_id, config, flush_ms, lazy) .await?) 
} - pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result> { - Ok(self.http_client.list_timelines(*tenant_id).await?) - } - - pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> { - Ok(self - .http_client - .tenant_secondary_download(*tenant_id) - .await?) + pub async fn timeline_list( + &self, + tenant_shard_id: &TenantShardId, + ) -> anyhow::Result> { + Ok(self.http_client.list_timelines(*tenant_shard_id).await?) } pub async fn timeline_create( &self, - tenant_id: TenantId, - new_timeline_id: Option, + tenant_shard_id: TenantShardId, + new_timeline_id: TimelineId, ancestor_start_lsn: Option, ancestor_timeline_id: Option, pg_version: Option, existing_initdb_timeline_id: Option, ) -> anyhow::Result { - // If timeline ID was not specified, generate one - let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate()); let req = models::TimelineCreateRequest { new_timeline_id, ancestor_start_lsn, @@ -510,7 +564,10 @@ impl PageServerNode { pg_version, existing_initdb_timeline_id, }; - Ok(self.http_client.timeline_create(tenant_id, &req).await?) + Ok(self + .http_client + .timeline_create(tenant_shard_id, &req) + .await?) } /// Import a basebackup prepared using either: @@ -538,7 +595,7 @@ impl PageServerNode { eprintln!("connection error: {}", e); } }); - tokio::pin!(client); + let client = std::pin::pin!(client); // Init base reader let (start_lsn, base_tarfile_path) = base; @@ -588,4 +645,14 @@ impl PageServerNode { Ok(()) } + + pub async fn tenant_synthetic_size( + &self, + tenant_shard_id: TenantShardId, + ) -> anyhow::Result { + Ok(self + .http_client + .tenant_synthetic_size(tenant_shard_id) + .await?) + } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 4026ef0eb9..4a320ce53d 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -7,7 +7,6 @@ //! 
``` use std::io::Write; use std::path::PathBuf; -use std::process::Child; use std::{io, result}; use anyhow::Context; @@ -15,6 +14,7 @@ use camino::Utf8PathBuf; use postgres_connection::PgConnectionConfig; use reqwest::{IntoUrl, Method}; use thiserror::Error; +use utils::auth::{Claims, Scope}; use utils::{http::error::HttpErrorBody, id::NodeId}; use crate::{ @@ -71,24 +71,31 @@ pub struct SafekeeperNode { pub pg_connection_config: PgConnectionConfig, pub env: LocalEnv, pub http_client: reqwest::Client, + pub listen_addr: String, pub http_base_url: String, } impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { + let listen_addr = if let Some(ref listen_addr) = conf.listen_addr { + listen_addr.clone() + } else { + "127.0.0.1".to_string() + }; SafekeeperNode { id: conf.id, conf: conf.clone(), - pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), + pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port), env: env.clone(), http_client: reqwest::Client::new(), - http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port), + http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port), + listen_addr, } } /// Construct libpq connection string for connecting to this safekeeper. 
- fn safekeeper_connection_config(port: u16) -> PgConnectionConfig { - PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port) + fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig { + PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port) } pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf { @@ -104,7 +111,7 @@ impl SafekeeperNode { .expect("non-Unicode path") } - pub async fn start(&self, extra_opts: Vec) -> anyhow::Result { + pub async fn start(&self, extra_opts: Vec) -> anyhow::Result<()> { print!( "Starting safekeeper at '{}' in '{}'", self.pg_connection_config.raw_address(), @@ -112,8 +119,8 @@ impl SafekeeperNode { ); io::stdout().flush().unwrap(); - let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port); - let listen_http = format!("127.0.0.1:{}", self.conf.http_port); + let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port); + let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port); let id = self.id; let datadir = self.datadir_path(); @@ -140,7 +147,7 @@ impl SafekeeperNode { availability_zone, ]; if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port { - let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port); + let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port); args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]); } if !self.conf.sync { @@ -191,7 +198,7 @@ impl SafekeeperNode { &datadir, &self.env.safekeeper_bin(), &args, - [], + self.safekeeper_env_variables()?, background_process::InitialPidFile::Expect(self.pid_file()), || async { match self.check_status().await { @@ -204,6 +211,18 @@ impl SafekeeperNode { .await } + fn safekeeper_env_variables(&self) -> anyhow::Result> { + // Generate a token to connect from safekeeper to peers + if self.conf.auth_enabled { + let token = self + .env + .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?; 
+ Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)]) + } else { + Ok(Vec::new()) + } + } + /// /// Stop the server. /// diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs new file mode 100644 index 0000000000..4f9f0ba794 --- /dev/null +++ b/control_plane/src/storage_controller.rs @@ -0,0 +1,577 @@ +use crate::{ + background_process, + local_env::{LocalEnv, NeonStorageControllerConf}, +}; +use camino::{Utf8Path, Utf8PathBuf}; +use pageserver_api::{ + controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, + TenantShardMigrateRequest, TenantShardMigrateResponse, + }, + models::{ + TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, + TimelineCreateRequest, TimelineInfo, + }, + shard::{ShardStripeSize, TenantShardId}, +}; +use pageserver_client::mgmt_api::ResponseErrorMessageExt; +use postgres_backend::AuthType; +use reqwest::Method; +use serde::{de::DeserializeOwned, Deserialize, Serialize}; +use std::{fs, str::FromStr}; +use tokio::process::Command; +use tracing::instrument; +use url::Url; +use utils::{ + auth::{encode_from_key_file, Claims, Scope}, + id::{NodeId, TenantId}, +}; + +pub struct StorageController { + env: LocalEnv, + listen: String, + path: Utf8PathBuf, + private_key: Option>, + public_key: Option, + postgres_port: u16, + client: reqwest::Client, + config: NeonStorageControllerConf, +} + +const COMMAND: &str = "storage_controller"; + +const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; + +#[derive(Serialize, Deserialize)] +pub struct AttachHookRequest { + pub tenant_shard_id: TenantShardId, + pub node_id: Option, + pub generation_override: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct AttachHookResponse { + pub gen: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct InspectRequest { + pub tenant_shard_id: TenantShardId, +} + +#[derive(Serialize, Deserialize)] +pub struct InspectResponse { + pub attachment: 
Option<(u32, NodeId)>, +} + +impl StorageController { + pub fn from_env(env: &LocalEnv) -> Self { + let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone()) + .unwrap() + .join("attachments.json"); + + // Makes no sense to construct this if pageservers aren't going to use it: assume + // pageservers have control plane API set + let listen_url = env.control_plane_api.clone().unwrap(); + + let listen = format!( + "{}:{}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + ); + + // Convention: NeonEnv in python tests reserves the next port after the control_plane_api + // port, for use by our captive postgres. + let postgres_port = listen_url + .port() + .expect("Control plane API setting should always have a port") + + 1; + + // Assume all pageservers have symmetric auth configuration: this service + // expects to use one JWT token to talk to all of them. + let ps_conf = env + .pageservers + .first() + .expect("Config is validated to contain at least one pageserver"); + let (private_key, public_key) = match ps_conf.http_auth_type { + AuthType::Trust => (None, None), + AuthType::NeonJWT => { + let private_key_path = env.get_private_key_path(); + let private_key = fs::read(private_key_path).expect("failed to read private key"); + + // If pageserver auth is enabled, this implicitly enables auth for this service, + // using the same credentials. + let public_key_path = + camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem")) + .unwrap(); + + // This service takes keys as a string rather than as a path to a file/dir: read the key into memory. + let public_key = if std::fs::metadata(&public_key_path) + .expect("Can't stat public key") + .is_dir() + { + // Our config may specify a directory: this is for the pageserver's ability to handle multiple + // keys. We only use one key at a time, so, arbitrarily load the first one in the directory. 
+ let mut dir = + std::fs::read_dir(&public_key_path).expect("Can't readdir public key path"); + let dent = dir + .next() + .expect("Empty key dir") + .expect("Error reading key dir"); + + std::fs::read_to_string(dent.path()).expect("Can't read public key") + } else { + std::fs::read_to_string(&public_key_path).expect("Can't read public key") + }; + (Some(private_key), Some(public_key)) + } + }; + + Self { + env: env.clone(), + path, + listen, + private_key, + public_key, + postgres_port, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + config: env.storage_controller.clone(), + } + } + + fn pid_file(&self) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid")) + .expect("non-Unicode path") + } + + /// PIDFile for the postgres instance used to store storage controller state + fn postgres_pid_file(&self) -> Utf8PathBuf { + Utf8PathBuf::from_path_buf( + self.env + .base_data_dir + .join("storage_controller_postgres.pid"), + ) + .expect("non-Unicode path") + } + + /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` + /// + /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back + /// to other versions if that one isn't found. Some automated tests create circumstances + /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. + pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; + + for v in prefer_versions { + let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); + if tokio::fs::try_exists(&path).await? 
{ + return Ok(path); + } + } + + // Fall through + anyhow::bail!( + "Postgres binaries not found in {}", + self.env.pg_distrib_dir.display() + ); + } + + /// Readiness check for our postgres process + async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { + let bin_path = pg_bin_dir.join("pg_isready"); + let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)]; + let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?; + + Ok(exitcode.success()) + } + + /// Create our database if it doesn't exist, and run migrations. + /// + /// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement + /// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers + /// who just want to run `cargo neon_local` without knowing about diesel. + /// + /// Returns the database url + pub async fn setup_database(&self) -> anyhow::Result { + const DB_NAME: &str = "storage_controller"; + let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port); + + let pg_bin_dir = self.get_pg_bin_dir().await?; + let createdb_path = pg_bin_dir.join("createdb"); + let output = Command::new(&createdb_path) + .args([ + "-h", + "localhost", + "-p", + &format!("{}", self.postgres_port), + DB_NAME, + ]) + .output() + .await + .expect("Failed to spawn createdb"); + + if !output.status.success() { + let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb"); + if stderr.contains("already exists") { + tracing::info!("Database {DB_NAME} already exists"); + } else { + anyhow::bail!("createdb failed with status {}: {stderr}", output.status); + } + } + + Ok(database_url) + } + + pub async fn start(&self) -> anyhow::Result<()> { + // Start a vanilla Postgres process used by the storage controller for persistence. 
+ let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone()) + .unwrap() + .join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_log_path = pg_data_path.join("postgres.log"); + + if !tokio::fs::try_exists(&pg_data_path).await? { + // Initialize empty database + let initdb_path = pg_bin_dir.join("initdb"); + let mut child = Command::new(&initdb_path) + .args(["-D", pg_data_path.as_ref()]) + .spawn() + .expect("Failed to spawn initdb"); + let status = child.wait().await?; + if !status.success() { + anyhow::bail!("initdb failed with status {status}"); + } + + // Write a minimal config file: + // - Specify the port, since this is chosen dynamically + // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing + // the storage controller we don't want a slow local disk to interfere with that. + tokio::fs::write( + &pg_data_path.join("postgresql.conf"), + format!("port = {}\nfsync=off\n", self.postgres_port), + ) + .await?; + }; + + println!("Starting storage controller database..."); + let db_start_args = [ + "-w", + "-D", + pg_data_path.as_ref(), + "-l", + pg_log_path.as_ref(), + "start", + ]; + + background_process::start_process( + "storage_controller_db", + &self.env.base_data_dir, + pg_bin_dir.join("pg_ctl").as_std_path(), + db_start_args, + [], + background_process::InitialPidFile::Create(self.postgres_pid_file()), + || self.pg_isready(&pg_bin_dir), + ) + .await?; + + // Run migrations on every startup, in case something changed. 
+ let database_url = self.setup_database().await?; + + let mut args = vec![ + "-l", + &self.listen, + "-p", + self.path.as_ref(), + "--dev", + "--database-url", + &database_url, + "--max-unavailable-interval", + &humantime::Duration::from(self.config.max_unavailable).to_string(), + ] + .into_iter() + .map(|s| s.to_string()) + .collect::>(); + if let Some(private_key) = &self.private_key { + let claims = Claims::new(None, Scope::PageServerApi); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); + args.push(format!("--jwt-token={jwt_token}")); + } + + if let Some(public_key) = &self.public_key { + args.push(format!("--public-key=\"{public_key}\"")); + } + + if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api { + args.push(format!( + "--compute-hook-url={control_plane_compute_hook_api}" + )); + } + + if let Some(split_threshold) = self.config.split_threshold.as_ref() { + args.push(format!("--split-threshold={split_threshold}")) + } + + args.push(format!( + "--neon-local-repo-dir={}", + self.env.base_data_dir.display() + )); + + background_process::start_process( + COMMAND, + &self.env.base_data_dir, + &self.env.storage_controller_bin(), + args, + [], + background_process::InitialPidFile::Create(self.pid_file()), + || async { + match self.ready().await { + Ok(_) => Ok(true), + Err(_) => Ok(false), + } + }, + ) + .await?; + + Ok(()) + } + + pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> { + background_process::stop_process(immediate, COMMAND, &self.pid_file())?; + + let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); + let pg_bin_dir = self.get_pg_bin_dir().await?; + + println!("Stopping storage controller database..."); + let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; + let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_stop_args) + .spawn()? 
+ .wait() + .await?; + if !stop_status.success() { + let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; + let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) + .args(pg_status_args) + .spawn()? + .wait() + .await?; + + // pg_ctl status returns this exit code if postgres is not running: in this case it is + // fine that stop failed. Otherwise it is an error that stop failed. + const PG_STATUS_NOT_RUNNING: i32 = 3; + if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() { + println!("Storage controller database is already stopped"); + return Ok(()); + } else { + anyhow::bail!("Failed to stop storage controller database: {stop_status}") + } + } + + Ok(()) + } + + fn get_claims_for_path(path: &str) -> anyhow::Result> { + let category = match path.find('/') { + Some(idx) => &path[..idx], + None => path, + }; + + match category { + "status" | "ready" => Ok(None), + "control" | "debug" => Ok(Some(Claims::new(None, Scope::Admin))), + "v1" => Ok(Some(Claims::new(None, Scope::PageServerApi))), + _ => Err(anyhow::anyhow!("Failed to determine claims for {}", path)), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch( + &self, + method: reqwest::Method, + path: String, + body: Option, + ) -> anyhow::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let listen_url = self.env.control_plane_api.clone().unwrap(); + let url = Url::from_str(&format!( + "http://{}:{}/{path}", + listen_url.host_str().unwrap(), + listen_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(private_key) = &self.private_key { + println!("Getting claims for path {}", path); + if let Some(required_claims) = Self::get_claims_for_path(&path)? 
{ + println!("Got claims {:?} for path {}", required_claims, path); + let jwt_token = encode_from_key_file(&required_claims, private_key)?; + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + } + + let response = builder.send().await?; + let response = response.error_from_body().await?; + + Ok(response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + } + + /// Call into the attach_hook API, for use before handing out attachments to pageservers + #[instrument(skip(self))] + pub async fn attach_hook( + &self, + tenant_shard_id: TenantShardId, + pageserver_id: NodeId, + ) -> anyhow::Result> { + let request = AttachHookRequest { + tenant_shard_id, + node_id: Some(pageserver_id), + generation_override: None, + }; + + let response = self + .dispatch::<_, AttachHookResponse>( + Method::POST, + "debug/v1/attach-hook".to_string(), + Some(request), + ) + .await?; + + Ok(response.gen) + } + + #[instrument(skip(self))] + pub async fn inspect( + &self, + tenant_shard_id: TenantShardId, + ) -> anyhow::Result> { + let request = InspectRequest { tenant_shard_id }; + + let response = self + .dispatch::<_, InspectResponse>( + Method::POST, + "debug/v1/inspect".to_string(), + Some(request), + ) + .await?; + + Ok(response.attachment) + } + + #[instrument(skip(self))] + pub async fn tenant_create( + &self, + req: TenantCreateRequest, + ) -> anyhow::Result { + self.dispatch(Method::POST, "v1/tenant".to_string(), Some(req)) + .await + } + + #[instrument(skip(self))] + pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result { + self.dispatch::<(), TenantCreateResponse>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/import"), + None, + ) + .await + } + + #[instrument(skip(self))] + pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result { + self.dispatch::<(), _>( + Method::GET, + format!("debug/v1/tenant/{tenant_id}/locate"), + None, + ) + .await + } + + 
#[instrument(skip(self))] + pub async fn tenant_migrate( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + ) -> anyhow::Result { + self.dispatch( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate"), + Some(TenantShardMigrateRequest { + tenant_shard_id, + node_id, + }), + ) + .await + } + + #[instrument(skip(self), fields(%tenant_id, %new_shard_count))] + pub async fn tenant_split( + &self, + tenant_id: TenantId, + new_shard_count: u8, + new_stripe_size: Option, + ) -> anyhow::Result { + self.dispatch( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(TenantShardSplitRequest { + new_shard_count, + new_stripe_size, + }), + ) + .await + } + + #[instrument(skip_all, fields(node_id=%req.node_id))] + pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> { + self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req)) + .await + } + + #[instrument(skip_all, fields(node_id=%req.node_id))] + pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> { + self.dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{}/config", req.node_id), + Some(req), + ) + .await + } + + #[instrument(skip(self))] + pub async fn ready(&self) -> anyhow::Result<()> { + self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None) + .await + } + + #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))] + pub async fn tenant_timeline_create( + &self, + tenant_id: TenantId, + req: TimelineCreateRequest, + ) -> anyhow::Result { + self.dispatch( + Method::POST, + format!("v1/tenant/{tenant_id}/timeline"), + Some(req), + ) + .await + } +} diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs deleted file mode 100644 index 23ea8f4060..0000000000 --- a/control_plane/src/tenant_migration.rs +++ /dev/null @@ -1,220 +0,0 @@ -//! -//! 
Functionality for migrating tenants across pageservers: unlike most of neon_local, this code -//! isn't scoped to a particular physical service, as it needs to update compute endpoints to -//! point to the new pageserver. -//! -use crate::local_env::LocalEnv; -use crate::{ - attachment_service::AttachmentService, endpoint::ComputeControlPlane, - pageserver::PageServerNode, -}; -use pageserver_api::models::{ - LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, -}; -use pageserver_api::shard::TenantShardId; -use std::collections::HashMap; -use std::time::Duration; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; - -/// Given an attached pageserver, retrieve the LSN for all timelines -async fn get_lsns( - tenant_id: TenantId, - pageserver: &PageServerNode, -) -> anyhow::Result> { - let timelines = pageserver.timeline_list(&tenant_id).await?; - Ok(timelines - .into_iter() - .map(|t| (t.timeline_id, t.last_record_lsn)) - .collect()) -} - -/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake -/// `baseline`. -async fn await_lsn( - tenant_id: TenantId, - pageserver: &PageServerNode, - baseline: HashMap, -) -> anyhow::Result<()> { - loop { - let latest = match get_lsns(tenant_id, pageserver).await { - Ok(l) => l, - Err(_e) => { - println!( - "🕑 Waiting for pageserver {} to activate...", - pageserver.conf.id - ); - std::thread::sleep(Duration::from_millis(500)); - continue; - } - }; - - let mut any_behind: bool = false; - for (timeline_id, baseline_lsn) in &baseline { - match latest.get(timeline_id) { - Some(latest_lsn) => { - println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"); - if latest_lsn < baseline_lsn { - any_behind = true; - } - } - None => { - // Expected timeline isn't yet visible on migration destination. - // (IRL we would have to account for timeline deletion, but this - // is just test helper) - any_behind = true; - } - } - } - - if !any_behind { - println!("✅ LSN caught up. 
Proceeding..."); - break; - } else { - std::thread::sleep(Duration::from_millis(500)); - } - } - - Ok(()) -} - -/// This function spans multiple services, to demonstrate live migration of a tenant -/// between pageservers: -/// - Coordinate attach/secondary/detach on pageservers -/// - call into attachment_service for generations -/// - reconfigure compute endpoints to point to new attached pageserver -pub async fn migrate_tenant( - env: &LocalEnv, - tenant_id: TenantId, - dest_ps: PageServerNode, -) -> anyhow::Result<()> { - println!("🤔 Checking existing status..."); - let attachment_service = AttachmentService::from_env(env); - - fn build_location_config( - mode: LocationConfigMode, - generation: Option, - secondary_conf: Option, - ) -> LocationConfig { - LocationConfig { - mode, - generation, - secondary_conf, - tenant_conf: TenantConfig::default(), - shard_number: 0, - shard_count: 0, - shard_stripe_size: 0, - } - } - - let previous = attachment_service.inspect(tenant_id).await?; - let mut baseline_lsns = None; - if let Some((generation, origin_ps_id)) = &previous { - let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?); - - if origin_ps_id == &dest_ps.conf.id { - println!("🔁 Already attached to {origin_ps_id}, freshening..."); - let gen = attachment_service - .attach_hook(tenant_id, dest_ps.conf.id) - .await?; - let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); - dest_ps.location_config(tenant_id, dest_conf, None).await?; - println!("✅ Migration complete"); - return Ok(()); - } - - println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode"); - - let stale_conf = - build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None); - origin_ps - .location_config(tenant_id, stale_conf, Some(Duration::from_secs(10))) - .await?; - - baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?); - } - - println!( - "🔁 Downloading latest layers to destination pageserver 
{}", - dest_ps.conf.id - ); - match dest_ps - .tenant_secondary_download(&TenantShardId::unsharded(tenant_id)) - .await - { - Ok(()) => {} - Err(_) => { - println!(" (skipping, destination wasn't in secondary mode)") - } - } - - let gen = attachment_service - .attach_hook(tenant_id, dest_ps.conf.id) - .await?; - let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None); - - println!("🔁 Attaching to pageserver {}", dest_ps.conf.id); - dest_ps.location_config(tenant_id, dest_conf, None).await?; - - if let Some(baseline) = baseline_lsns { - println!("🕑 Waiting for LSN to catch up..."); - await_lsn(tenant_id, &dest_ps, baseline).await?; - } - - let cplane = ComputeControlPlane::load(env.clone())?; - for (endpoint_name, endpoint) in &cplane.endpoints { - if endpoint.tenant_id == tenant_id { - println!( - "🔁 Reconfiguring endpoint {} to use pageserver {}", - endpoint_name, dest_ps.conf.id - ); - endpoint.reconfigure(Some(dest_ps.conf.id)).await?; - } - } - - for other_ps_conf in &env.pageservers { - if other_ps_conf.id == dest_ps.conf.id { - continue; - } - - let other_ps = PageServerNode::from_env(env, other_ps_conf); - let other_ps_tenants = other_ps.tenant_list().await?; - - // Check if this tenant is attached - let found = other_ps_tenants - .into_iter() - .map(|t| t.id) - .any(|i| i.tenant_id == tenant_id); - if !found { - continue; - } - - // Downgrade to a secondary location - let secondary_conf = build_location_config( - LocationConfigMode::Secondary, - None, - Some(LocationConfigSecondary { warm: true }), - ); - - println!( - "💤 Switching to secondary mode on pageserver {}", - other_ps.conf.id - ); - other_ps - .location_config(tenant_id, secondary_conf, None) - .await?; - } - - println!( - "🔁 Switching to AttachedSingle mode on pageserver {}", - dest_ps.conf.id - ); - let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None); - dest_ps.location_config(tenant_id, dest_conf, None).await?; - - println!("✅ 
Migration complete"); - - Ok(()) -} diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml new file mode 100644 index 0000000000..f96f0084b2 --- /dev/null +++ b/control_plane/storcon_cli/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "storcon_cli" +version = "0.1.0" +edition.workspace = true +license.workspace = true + + +[dependencies] +anyhow.workspace = true +clap.workspace = true +comfy-table.workspace = true +futures.workspace = true +humantime.workspace = true +hyper.workspace = true +pageserver_api.workspace = true +pageserver_client.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json = { workspace = true, features = ["raw_value"] } +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack.workspace = true + diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs new file mode 100644 index 0000000000..7b48b75c21 --- /dev/null +++ b/control_plane/storcon_cli/src/main.rs @@ -0,0 +1,948 @@ +use futures::StreamExt; +use std::{collections::HashMap, str::FromStr, time::Duration}; + +use clap::{Parser, Subcommand}; +use pageserver_api::{ + controller_api::{ + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, + TenantDescribeResponse, TenantPolicyRequest, + }, + models::{ + EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, + ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, + TenantShardSplitRequest, TenantShardSplitResponse, + }, + shard::{ShardStripeSize, TenantShardId}, +}; +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::{Method, StatusCode, Url}; +use serde::{de::DeserializeOwned, Serialize}; +use utils::id::{NodeId, TenantId}; + +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantLocateResponse, TenantShardMigrateRequest, 
TenantShardMigrateResponse, +}; + +#[derive(Subcommand, Debug)] +enum Command { + /// Register a pageserver with the storage controller. This shouldn't usually be necessary, + /// since pageservers auto-register when they start up + NodeRegister { + #[arg(long)] + node_id: NodeId, + + #[arg(long)] + listen_pg_addr: String, + #[arg(long)] + listen_pg_port: u16, + + #[arg(long)] + listen_http_addr: String, + #[arg(long)] + listen_http_port: u16, + }, + + /// Modify a node's configuration in the storage controller + NodeConfigure { + #[arg(long)] + node_id: NodeId, + + /// Availability is usually auto-detected based on heartbeats. Set 'offline' here to + /// manually mark a node offline + #[arg(long)] + availability: Option, + /// Scheduling policy controls whether tenant shards may be scheduled onto this node. + #[arg(long)] + scheduling: Option, + }, + /// Modify a tenant's policies in the storage controller + TenantPolicy { + #[arg(long)] + tenant_id: TenantId, + /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`), + /// or is in the normal attached state with N secondary locations (`attached:N`) + #[arg(long)] + placement: Option, + /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal, + /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents + /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant + /// unavailable, and are only for use in emergencies. + #[arg(long)] + scheduling: Option, + }, + /// List nodes known to the storage controller + Nodes {}, + /// List tenants known to the storage controller + Tenants {}, + /// Create a new tenant in the storage controller, and by extension on pageservers. 
+ TenantCreate { + #[arg(long)] + tenant_id: TenantId, + }, + /// Delete a tenant in the storage controller, and by extension on pageservers. + TenantDelete { + #[arg(long)] + tenant_id: TenantId, + }, + /// Split an existing tenant into a higher number of shards than its current shard count. + TenantShardSplit { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + shard_count: u8, + /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. + #[arg(long)] + stripe_size: Option, + }, + /// Migrate the attached location for a tenant shard to a specific pageserver. + TenantShardMigrate { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, + /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure + /// that is passed through to pageservers, and does not affect storage controller behavior. + TenantConfig { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + config: String, + }, + /// Attempt to balance the locations for a tenant across pageservers. This is a client-side + /// alternative to the storage controller's scheduling optimization behavior. + TenantScatter { + #[arg(long)] + tenant_id: TenantId, + }, + /// Print details about a particular tenant, including all its shards' states. + TenantDescribe { + #[arg(long)] + tenant_id: TenantId, + }, + /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary + /// mode so that it can warm up content on a pageserver. + TenantWarmup { + #[arg(long)] + tenant_id: TenantId, + }, + /// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate + /// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region. 
+ TenantDrop { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + unclean: bool, + }, + NodeDrop { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + unclean: bool, + }, + TenantSetTimeBasedEviction { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + period: humantime::Duration, + #[arg(long)] + threshold: humantime::Duration, + }, + // Drain a set of specified pageservers by moving the primary attachments to pageservers + // outside of the specified set. + Drain { + // Set of pageserver node ids to drain. + #[arg(long)] + nodes: Vec, + // Optional: migration concurrency (default is 8) + #[arg(long)] + concurrency: Option, + // Optional: maximum number of shards to migrate + #[arg(long)] + max_shards: Option, + // Optional: when set to true, nothing is migrated, but the plan is printed to stdout + #[arg(long)] + dry_run: Option, + }, +} + +#[derive(Parser)] +#[command( + author, + version, + about, + long_about = "CLI for Storage Controller Support/Debug" +)] +#[command(arg_required_else_help(true))] +struct Cli { + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + api: Url, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Depending on the API used, this + /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint + /// a token with both scopes to use with this tool. 
+ jwt: Option, + + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Clone)] +struct PlacementPolicyArg(PlacementPolicy); + +impl FromStr for PlacementPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "detached" => Ok(Self(PlacementPolicy::Detached)), + "secondary" => Ok(Self(PlacementPolicy::Secondary)), + _ if s.starts_with("attached:") => { + let mut splitter = s.split(':'); + let _prefix = splitter.next().unwrap(); + match splitter.next().and_then(|s| s.parse::().ok()) { + Some(n) => Ok(Self(PlacementPolicy::Attached(n))), + None => Err(anyhow::anyhow!( + "Invalid format '{s}', a valid example is 'attached:1'" + )), + } + } + _ => Err(anyhow::anyhow!( + "Unknown placement policy '{s}', try detached,secondary,attached:" + )), + } + } +} + +#[derive(Debug, Clone)] +struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); + +impl FromStr for ShardSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(ShardSchedulingPolicy::Active)), + "essential" => Ok(Self(ShardSchedulingPolicy::Essential)), + "pause" => Ok(Self(ShardSchedulingPolicy::Pause)), + "stop" => Ok(Self(ShardSchedulingPolicy::Stop)), + _ => Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,essential,pause,stop" + )), + } + } +} + +#[derive(Debug, Clone)] +struct NodeAvailabilityArg(NodeAvailabilityWrapper); + +impl FromStr for NodeAvailabilityArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(NodeAvailabilityWrapper::Active)), + "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)), + _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), + } + } +} + +struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed 
to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch( + &self, + method: Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = Cli::parse(); + + let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone()); + + let mut trimmed = cli.api.to_string(); + trimmed.pop(); + let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref()); + + match cli.command { + Command::NodeRegister { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + } => { + storcon_client + .dispatch::<_, ()>( + Method::POST, + "control/v1/node".to_string(), + Some(NodeRegisterRequest { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + }), + ) + .await?; + } + Command::TenantCreate { tenant_id } => { + vps_client + .tenant_create(&TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: 
Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }) + .await?; + } + Command::TenantDelete { tenant_id } => { + let status = vps_client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await?; + tracing::info!("Delete status: {}", status); + } + Command::Nodes {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + for node in resp { + table.add_row([ + format!("{}", node.id), + node.listen_http_addr, + format!("{:?}", node.scheduling), + format!("{:?}", node.availability), + ]); + } + println!("{table}"); + } + Command::NodeConfigure { + node_id, + availability, + scheduling, + } => { + let req = NodeConfigureRequest { + node_id, + availability: availability.map(|a| a.0), + scheduling, + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{node_id}/config"), + Some(req), + ) + .await?; + } + Command::Tenants {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header([ + "TenantId", + "ShardCount", + "StripeSize", + "Placement", + "Scheduling", + ]); + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + + println!("{table}"); + } + Command::TenantPolicy { + tenant_id, + placement, + scheduling, + } => { + let req = TenantPolicyRequest { + scheduling: scheduling.map(|s| s.0), + placement: placement.map(|p| p.0), + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + 
format!("control/v1/tenant/{tenant_id}/policy"), + Some(req), + ) + .await?; + } + Command::TenantShardSplit { + tenant_id, + shard_count, + stripe_size, + } => { + let req = TenantShardSplitRequest { + new_shard_count: shard_count, + new_stripe_size: stripe_size.map(ShardStripeSize), + }; + + let response = storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(req), + ) + .await?; + println!( + "Split tenant {} into {} shards: {}", + tenant_id, + shard_count, + response + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + } + Command::TenantShardMigrate { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { + tenant_shard_id, + node_id: node, + }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate"), + Some(req), + ) + .await?; + } + Command::TenantConfig { tenant_id, config } => { + let tenant_conf = serde_json::from_str(&config)?; + + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: tenant_conf, + }) + .await?; + } + Command::TenantScatter { tenant_id } => { + // Find the shards + let locate_response = storcon_client + .dispatch::<(), TenantLocateResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}/locate"), + None, + ) + .await?; + let shards = locate_response.shards; + + let mut node_to_shards: HashMap> = HashMap::new(); + let shard_count = shards.len(); + for s in shards { + let entry = node_to_shards.entry(s.node_id).or_default(); + entry.push(s.shard_id); + } + + // Load list of available nodes + let nodes_resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + for node in nodes_resp { + if matches!(node.availability, NodeAvailabilityWrapper::Active) { + node_to_shards.entry(node.id).or_default(); + } + } + + let max_shard_per_node = shard_count / node_to_shards.len(); + + loop { + let mut 
migrate_shard = None; + for shards in node_to_shards.values_mut() { + if shards.len() > max_shard_per_node { + // Pick the emptiest + migrate_shard = Some(shards.pop().unwrap()); + } + } + let Some(migrate_shard) = migrate_shard else { + break; + }; + + // Pick the emptiest node to migrate to + let mut destinations = node_to_shards + .iter() + .map(|(k, v)| (k, v.len())) + .collect::>(); + destinations.sort_by_key(|i| i.1); + let (destination_node, destination_count) = *destinations.first().unwrap(); + if destination_count + 1 > max_shard_per_node { + // Even the emptiest destination doesn't have space: we're done + break; + } + let destination_node = *destination_node; + + node_to_shards + .get_mut(&destination_node) + .unwrap() + .push(migrate_shard); + + println!("Migrate {} -> {} ...", migrate_shard, destination_node); + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{migrate_shard}/migrate"), + Some(TenantShardMigrateRequest { + tenant_shard_id: migrate_shard, + node_id: destination_node, + }), + ) + .await?; + println!("Migrate {} -> {} OK", migrate_shard, destination_node); + } + + // Spread the shards across the nodes + } + Command::TenantDescribe { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + let shards = describe_response.shards; + let mut table = comfy_table::Table::new(); + table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + for shard in shards { + let secondary = shard + .node_secondary + .iter() + .map(|n| format!("{}", n)) + .collect::>() + .join(","); + + let mut status_parts = Vec::new(); + if shard.is_reconciling { + status_parts.push("reconciling"); + } + + if shard.is_pending_compute_notification { + status_parts.push("pending_compute"); + } + + if shard.is_splitting { + status_parts.push("splitting"); + } + let status = status_parts.join(","); + + 
table.add_row([ + format!("{}", shard.tenant_shard_id), + shard + .node_attached + .map(|n| format!("{}", n)) + .unwrap_or(String::new()), + secondary, + shard.last_error, + status, + ]); + } + println!("{table}"); + } + Command::TenantWarmup { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await; + match describe_response { + Ok(describe) => { + if matches!(describe.policy, PlacementPolicy::Secondary) { + // Fine: it's already known to controller in secondary mode: calling + // again to put it into secondary mode won't cause problems. + } else { + anyhow::bail!("Tenant already present with policy {:?}", describe.policy); + } + } + Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { + // Fine: this tenant isn't know to the storage controller yet. + } + Err(e) => { + // Unexpected API error + return Err(e.into()); + } + } + + vps_client + .location_config( + TenantShardId::unsharded(tenant_id), + pageserver_api::models::LocationConfig { + mode: pageserver_api::models::LocationConfigMode::Secondary, + generation: None, + secondary_conf: Some(LocationConfigSecondary { warm: true }), + shard_number: 0, + shard_count: 0, + shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, + tenant_conf: TenantConfig::default(), + }, + None, + true, + ) + .await?; + + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + + let secondary_ps_id = describe_response + .shards + .first() + .unwrap() + .node_secondary + .first() + .unwrap(); + + println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); + loop { + let (status, progress) = vps_client + .tenant_secondary_download( + TenantShardId::unsharded(tenant_id), + Some(Duration::from_secs(10)), + ) + .await?; + println!( + "Progress: {}/{} layers, {}/{} bytes", + 
progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + match status { + StatusCode::OK => { + println!("Download complete"); + break; + } + StatusCode::ACCEPTED => { + // Loop + } + _ => { + anyhow::bail!("Unexpected download status: {status}"); + } + } + } + } + Command::TenantDrop { tenant_id, unclean } => { + if !unclean { + anyhow::bail!("This command is not a tenant deletion, and uncleanly drops all controller state for the tenant. If you know what you're doing, add `--unclean` to proceed.") + } + storcon_client + .dispatch::<(), ()>( + Method::POST, + format!("debug/v1/tenant/{tenant_id}/drop"), + None, + ) + .await?; + } + Command::NodeDrop { node_id, unclean } => { + if !unclean { + anyhow::bail!("This command is not a clean node decommission, and uncleanly drops all controller state for the node, without checking if any tenants still refer to it. If you know what you're doing, add `--unclean` to proceed.") + } + storcon_client + .dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None) + .await?; + } + Command::TenantSetTimeBasedEviction { + tenant_id, + period, + threshold, + } => { + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: TenantConfig { + eviction_policy: Some(EvictionPolicy::LayerAccessThreshold( + EvictionPolicyLayerAccessThreshold { + period: period.into(), + threshold: threshold.into(), + }, + )), + ..Default::default() + }, + }) + .await?; + } + Command::Drain { + nodes, + concurrency, + max_shards, + dry_run, + } => { + // Load the list of nodes, split them up into the drained and filled sets, + // and validate that draining is possible. 
+ let node_descs = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + let mut node_to_drain_descs = Vec::new(); + let mut node_to_fill_descs = Vec::new(); + + for desc in node_descs { + let to_drain = nodes.iter().any(|id| *id == desc.id); + if to_drain { + node_to_drain_descs.push(desc); + } else { + node_to_fill_descs.push(desc); + } + } + + if nodes.len() != node_to_drain_descs.len() { + anyhow::bail!("Drain requested for node which doesn't exist.") + } + + node_to_fill_descs.retain(|desc| { + matches!(desc.availability, NodeAvailabilityWrapper::Active) + && matches!( + desc.scheduling, + NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling + ) + }); + + if node_to_fill_descs.is_empty() { + anyhow::bail!("There are no nodes to drain to") + } + + // Set the node scheduling policy to draining for the nodes which + // we plan to drain. + for node_desc in node_to_drain_descs.iter() { + let req = NodeConfigureRequest { + node_id: node_desc.id, + availability: None, + scheduling: Some(NodeSchedulingPolicy::Draining), + }; + + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{}/config", node_desc.id), + Some(req), + ) + .await?; + } + + // Perform the drain: move each tenant shard scheduled on a node to + // be drained to a node which is being filled. A simple round robin + // strategy is used to pick the new node. 
+ let tenants = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + + let mut selected_node_idx = 0; + + struct DrainMove { + tenant_shard_id: TenantShardId, + from: NodeId, + to: NodeId, + } + + let mut moves: Vec = Vec::new(); + + let shards = tenants + .into_iter() + .flat_map(|tenant| tenant.shards.into_iter()); + for shard in shards { + if let Some(max_shards) = max_shards { + if moves.len() >= max_shards { + println!( + "Stop planning shard moves since the requested maximum was reached" + ); + break; + } + } + + let should_migrate = { + if let Some(attached_to) = shard.node_attached { + node_to_drain_descs + .iter() + .map(|desc| desc.id) + .any(|id| id == attached_to) + } else { + false + } + }; + + if !should_migrate { + continue; + } + + moves.push(DrainMove { + tenant_shard_id: shard.tenant_shard_id, + from: shard + .node_attached + .expect("We only migrate attached tenant shards"), + to: node_to_fill_descs[selected_node_idx].id, + }); + selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len(); + } + + let total_moves = moves.len(); + + if dry_run == Some(true) { + println!("Dryrun requested. 
Planned {total_moves} moves:"); + for mv in &moves { + println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to) + } + + return Ok(()); + } + + const DEFAULT_MIGRATE_CONCURRENCY: usize = 8; + let mut stream = futures::stream::iter(moves) + .map(|mv| { + let client = Client::new(cli.api.clone(), cli.jwt.clone()); + async move { + client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), + Some(TenantShardMigrateRequest { + tenant_shard_id: mv.tenant_shard_id, + node_id: mv.to, + }), + ) + .await + .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) + } + }) + .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY)); + + let mut success = 0; + let mut failure = 0; + + while let Some(res) = stream.next().await { + match res { + Ok(_) => { + success += 1; + } + Err((tenant_shard_id, from, to, error)) => { + failure += 1; + println!( + "Failed to migrate {} from node {} to node {}: {}", + tenant_shard_id, from, to, error + ); + } + } + + if (success + failure) % 20 == 0 { + println!( + "Processed {}/{} shards: {} succeeded, {} failed", + success + failure, + total_moves, + success, + failure + ); + } + } + + println!( + "Processed {}/{} shards: {} succeeded, {} failed", + success + failure, + total_moves, + success, + failure + ); + } + } + + Ok(()) +} diff --git a/deny.toml b/deny.toml index 22e39a2ca3..469609c496 100644 --- a/deny.toml +++ b/deny.toml @@ -99,6 +99,13 @@ name = "async-executor" [[bans.deny]] name = "smol" +[[bans.deny]] +# We want to use rustls instead of the platform's native tls implementation. +name = "native-tls" + +[[bans.deny]] +name = "openssl" + # This section is considered when running `cargo deny check sources`. 
# More documentation about the 'sources' section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html diff --git a/diesel.toml b/diesel.toml new file mode 100644 index 0000000000..558c54a1e1 --- /dev/null +++ b/diesel.toml @@ -0,0 +1,9 @@ +# For documentation on how to configure this file, +# see https://diesel.rs/guides/configuring-diesel-cli + +[print_schema] +file = "storage_controller/src/schema.rs" +custom_type_derives = ["diesel::query_builder::QueryId"] + +[migrations_directory] +dir = "storage_controller/migrations" diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index f1b1986072..8378f37b48 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -1,4 +1,4 @@ -ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG REPOSITORY=neondatabase ARG COMPUTE_IMAGE=compute-node-v14 ARG TAG=latest @@ -8,6 +8,11 @@ USER root RUN apt-get update && \ apt-get install -y curl \ jq \ + python3-pip \ netcat +#Faker is required for the pg_anon test +RUN pip3 install Faker +#This is required for the pg_hintplan test +RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src -USER postgres +USER postgres \ No newline at end of file diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index ccf0a91b90..8e582e74e1 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -95,7 +95,7 @@ }, { "name": "shared_preload_libraries", - "value": "neon", + "value": "neon,pg_cron,timescaledb,pg_stat_statements", "vartype": "string" }, { @@ -127,6 +127,16 @@ "name": "max_replication_flush_lag", "value": "10GB", "vartype": "string" + }, + { + "name": "cron.database", + "value": "postgres", + "vartype": "string" + }, + { + "name": 
"session_preload_libraries", + "value": "anon", + "vartype": "string" } ] }, diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 9777d1fdd2..5503b6611a 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3' - services: minio: restart: always @@ -161,12 +159,12 @@ services: context: ./compute_wrapper/ args: - REPOSITORY=${REPOSITORY:-neondatabase} - - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14} + - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - TAG=${TAG:-latest} - http_proxy=$http_proxy - https_proxy=$https_proxy environment: - - PG_VERSION=${PG_VERSION:-14} + - PG_VERSION=${PG_VERSION:-16} #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: @@ -194,3 +192,14 @@ services: done" depends_on: - compute + + neon-test-extensions: + profiles: ["test-extensions"] + image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + entrypoint: + - "/bin/bash" + - "-c" + command: + - sleep 1800 + depends_on: + - compute diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index e18b0f9176..a00591afd0 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -7,54 +7,94 @@ # Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file # Their defaults point at DockerHub `neondatabase/neon:latest` image.`, # to verify custom image builds (e.g pre-published ones). - -# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer. 
- +# +# A test script for postgres extensions +# Currently supports only v16 +# set -eux -o pipefail -SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml - +COMPOSE_FILE='docker-compose.yml' +cd $(dirname $0) COMPUTE_CONTAINER_NAME=docker-compose-compute-1 -SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" -PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" +TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1 +PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" +: ${http_proxy:=} +: ${https_proxy:=} +export http_proxy https_proxy cleanup() { echo "show container information" docker ps - docker compose -f $COMPOSE_FILE logs + docker compose --profile test-extensions -f $COMPOSE_FILE logs echo "stop containers..." - docker compose -f $COMPOSE_FILE down + docker compose --profile test-extensions -f $COMPOSE_FILE down } -echo "clean up containers if exists" -cleanup - for pg_version in 14 15 16; do - echo "start containers (pg_version=$pg_version)." - PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d + echo "clean up containers if exists" + cleanup + PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version)) + PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 - while sleep 1; do + while sleep 3; do # check timeout - cnt=`expr $cnt + 1` + cnt=`expr $cnt + 3` if [ $cnt -gt 60 ]; then echo "timeout before the compute is ready." 
cleanup exit 1 fi - - # check if the compute is ready - set +o pipefail - result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` - set -o pipefail - if [ $result -eq 1 ]; then + if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then echo "OK. The compute is ready to connect." echo "execute simple queries." docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" - cleanup break fi done + + if [ $pg_version -ge 16 ] + then + echo Enabling trust connection + docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " + echo Adding postgres role + docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN" + # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail + # It cannot be moved to Dockerfile now because the database directory is created after the start of the container + echo Adding dummy config + docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf + # This block is required for the pg_anon extension test. + # The test assumes that it is running on the same host with the postgres engine. 
+ # In our case it's not true, that's why we are copying files to the compute node + TMPDIR=$(mktemp -d) + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data + echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data + rm -rf $TMPDIR + TMPDIR=$(mktemp -d) + # The following block does the same for the pg_hintplan test + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ + rm -rf $TMPDIR + # We are running tests now + if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt + then + cleanup + else + FAILED=$(tail -1 testout.txt) + for d in $FAILED + do + mkdir $d + docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true + docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true + cat $d/regression.out $d/regression.diffs || true + done + rm -rf $FAILED + cleanup + exit 1 + fi + fi + cleanup done diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh new file mode 100644 index 0000000000..c05fc159aa --- /dev/null +++ b/docker-compose/run-tests.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -x + +cd /ext-src +FAILED= +LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u) +for d in ${LIST} +do + [ -d ${d} ] || continue + psql -c "select 1" >/dev/null || break + make -C ${d} installcheck || FAILED="${d} ${FAILED}" +done +[ -z "${FAILED}" ] && exit 0 +echo ${FAILED} +exit 1 \ No newline at end of file diff --git a/docs/authentication.md b/docs/authentication.md index f768b04c5b..522c5481b4 100644 --- a/docs/authentication.md +++ b/docs/authentication.md @@ -70,6 +70,9 @@ Should only be used e.g. for status check/tenant creation/list. Should only be used e.g. 
for status check. Currently also used for connection from any pageserver to any safekeeper. +"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane. + +"admin": Provides access to the control plane and admin APIs of the storage controller. ### CLI CLI generates a key pair during call to `neon_local init` with the following commands: diff --git a/docs/core_changes.md b/docs/core_changes.md index ea219adae9..1388317728 100644 --- a/docs/core_changes.md +++ b/docs/core_changes.md @@ -11,15 +11,28 @@ page server. We currently use the same binary for both, with --wal-redo runtime the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for the WAL redo process. -In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the -smgr interface. Once all the core changes have been submitted to upstream or eliminated some other -way, the extension could live outside the postgres repository and build against vanilla PostgreSQL. +In addition to core PostgreSQL changes, there is a Neon extension in the pgxn/neon directory that +hooks into the smgr interface, and rmgr extension in pgxn/neon_rmgr. The extensions are loaded into +the Postgres processes with shared_preload_libraries. Most of the Neon-specific code is in the +extensions, and for any new features, that is preferred over modifying core PostgreSQL code. Below is a list of all the PostgreSQL source code changes, categorized into changes needed for compute, and changes needed for the WAL redo process: # Changes for Compute node +## Prefetching + +There are changes in many places to perform prefetching, for example for sequential scans. Neon +doesn't benefit from OS readahead, and the latency to pageservers is quite high compared to local +disk, so prefetching is critical for performance, also for sequential scans. 
+ +### How to get rid of the patch + +Upcoming "streaming read" work in v17 might simplify this. And async I/O work in v18 will hopefully +do more. + + ## Add t_cid to heap WAL records ``` @@ -37,54 +50,11 @@ The problem is that the XLOG_HEAP_INSERT record does not include the command id Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information. +Update from Heikki (2024-04-17): I tried to write an upstream patch for that, to use the t_cid field for logical decoding, but it was not as straightforward as it first sounded. ### Alternatives Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched a transaction that's still running. However, that seems very complicated. -## ginfast.c - -``` -diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c -index e0d9940946..2d964c02e9 100644 ---- a/src/backend/access/gin/ginfast.c -+++ b/src/backend/access/gin/ginfast.c -@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) - memset(&sublist, 0, sizeof(GinMetaPageData)); - makeSublist(index, collector->tuples, collector->ntuples, &sublist); - -+ if (metadata->head != InvalidBlockNumber) -+ { -+ /* -+ * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call -+ * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from -+ * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write() -+ * will try to WAL-log an image of the page. 
-+ */ -+ buffer = ReadBuffer(index, metadata->tail); -+ } -+ - if (needWal) - XLogBeginInsert(); - -@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) - data.prevTail = metadata->tail; - data.newRightlink = sublist.head; - -- buffer = ReadBuffer(index, metadata->tail); - LockBuffer(buffer, GIN_EXCLUSIVE); - page = BufferGetPage(buffer); -``` - -The problem is explained in the comment above - -### How to get rid of the patch - -Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical -section or something. - -Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images? - - ## Mark index builds that use buffer manager without logging explicitly ``` @@ -95,6 +65,8 @@ Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and also some changes in src/backend/storage/smgr/smgr.c ``` +pgvector 0.6.0 also needs a similar change, which would be very nice to get rid of too. + When a GIN index is built, for example, it is built by inserting the entries into the index more or less normally, but without WAL-logging anything. After the index has been built, we iterate through all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged @@ -109,6 +81,10 @@ an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` I think it would make sense to be more explicit about that in PostgreSQL too. So extract these changes to a patch and post to pgsql-hackers. +Perhaps we could deduce that an unlogged index build has started when we see a page being evicted +with zero LSN. How to be sure it's an unlogged index build rather than a bug? Currently we have a +check for that and PANIC if we see page with zero LSN being evicted. And how do we detect when the +index build has finished? See https://github.com/neondatabase/neon/pull/7440 for an attempt at that. 
## Track last-written page LSN @@ -140,57 +116,6 @@ The old method is still available, though. Wait until v15? -## Cache relation sizes - -The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going -to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the -relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for -Neon) - - -## Use buffer manager when extending VM or FSM - -``` - src/backend/storage/freespace/freespace.c | 14 +- - src/backend/access/heap/visibilitymap.c | 15 +- - -diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c -index e198df65d8..addfe93eac 100644 ---- a/src/backend/access/heap/visibilitymap.c -+++ b/src/backend/access/heap/visibilitymap.c -@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) - /* Now extend the file */ - while (vm_nblocks_now < vm_nblocks) - { -- PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); -+ /* -+ * ZENITH: Initialize VM pages through buffer cache to prevent loading -+ * them from pageserver. -+ */ -+ Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW, -+ RBM_ZERO_AND_LOCK, NULL); -+ Page page = BufferGetPage(buffer); -+ -+ PageInit((Page) page, BLCKSZ, 0); -+ PageSetChecksumInplace(page, vm_nblocks_now); -+ MarkBufferDirty(buffer); -+ UnlockReleaseBuffer(buffer); - -- smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, -- pg.data, false); - vm_nblocks_now++; - } -``` - -### Problem we're trying to solve - -??? - -### How to get rid of the patch - -Maybe this would be a reasonable change in PostgreSQL too? - - ## Allow startup without reading checkpoint record In Neon, the compute node is stateless. So when we are launching compute node, we need to provide @@ -231,7 +156,7 @@ index 0415df9ccb..9f9db3c8bc 100644 * crash we can lose (skip over) as many values as we pre-logged. 
*/ -#define SEQ_LOG_VALS 32 -+/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ ++/* Neon XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */ +/* #define SEQ_LOG_VALS 32 */ +#define SEQ_LOG_VALS 0 ``` @@ -250,66 +175,6 @@ would be weird if the sequence moved backwards though, think of PITR. Or add a GUC for the amount to prefix to PostgreSQL, and force it to 1 in Neon. -## Walproposer - -``` - src/Makefile | 1 + - src/backend/replication/libpqwalproposer/Makefile | 37 + - src/backend/replication/libpqwalproposer/libpqwalproposer.c | 416 ++++++++++++ - src/backend/postmaster/bgworker.c | 4 + - src/backend/postmaster/postmaster.c | 6 + - src/backend/replication/Makefile | 4 +- - src/backend/replication/walproposer.c | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - src/backend/replication/walproposer_utils.c | 402 +++++++++++ - src/backend/replication/walreceiver.c | 7 + - src/backend/replication/walsender.c | 320 ++++++--- - src/backend/storage/ipc/ipci.c | 6 + - src/include/replication/walproposer.h | 565 ++++++++++++++++ -``` - -WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes. It is -currently implemented as patch to standard WAL sender. - -### How to get rid of the patch - -Refactor into an extension. Submit hooks or APIs into upstream if necessary. - -@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96 - -## Ignore unexpected data beyond EOF in bufmgr.c - -``` -@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, - */ - bufBlock = isLocalBuf ? 
LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); - if (!PageIsNew((Page) bufBlock)) -- ereport(ERROR, -+ { -+ // XXX-ZENITH -+ MemSet((char *) bufBlock, 0, BLCKSZ); -+ ereport(DEBUG1, - (errmsg("unexpected data beyond EOF in block %u of relation %s", - blockNum, relpath(smgr->smgr_rnode, forkNum)), - errhint("This has been seen to occur with buggy kernels; consider updating your system."))); -- -+ } - /* - * We *must* do smgrextend before succeeding, else the page will not - * be reserved by the kernel, and the next P_NEW call will decide to -``` - -PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros -first, then the page is filled, and finally the new page WAL-logged. But if multiple backends extend -a relation at the same time, the pages can be WAL-logged in different order. - -I'm not sure what scenario exactly required this change in Neon, though. - -### How to get rid of the patch - -Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit -confusing even in PostgreSQL. Maybe WAL log the intention to extend first, then extend the relation, -and finally WAL-log that the extension succeeded. - ## Make smgr interface available to extensions ``` @@ -321,6 +186,8 @@ and finally WAL-log that the extension succeeded. Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression. +We have submitted this to upstream, but it's moving at a glacial speed. +https://commitfest.postgresql.org/47/4428/ ## Added relpersistence argument to smgropen() @@ -444,6 +311,148 @@ Ignore it. This is only needed for disaster recovery, so once we've eliminated a patches, we can just keep it around as a patch or as separate branch in a repo. +## pg_waldump flags to ignore errors + +After creating a new project or branch in Neon, the first timeline can begin in the middle of a WAL segment.
pg_waldump chokes on that, so we added some flags to make it possible to ignore errors. + +### How to get rid of the patch + +Like the previous one, ignore it. + + + +## Backpressure if pageserver doesn't ingest WAL fast enough + +``` +@@ -3200,6 +3202,7 @@ ProcessInterrupts(void) + return; + InterruptPending = false; + ++retry: + if (ProcDiePending) + { + ProcDiePending = false; +@@ -3447,6 +3450,13 @@ ProcessInterrupts(void) + + if (ParallelApplyMessagePending) + HandleParallelApplyMessages(); ++ ++ /* Call registered callback if any */ ++ if (ProcessInterruptsCallback) ++ { ++ if (ProcessInterruptsCallback()) ++ goto retry; ++ } + } +``` + + +### How to get rid of the patch + +Submit a patch to upstream, for a hook in ProcessInterrupts. Could be useful for other extensions +too. + + +## SLRU on-demand download + +``` + src/backend/access/transam/slru.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 92 insertions(+), 13 deletions(-) +``` + +### Problem we're trying to solve + +Previously, SLRU files were included in the basebackup, but the total size of them can be large, +several GB, and downloading them all made the startup time too long. + +### Alternatives + +FUSE hook or LD_PRELOAD trick to intercept the reads on SLRU files + + +## WAL-log an all-zeros page as one large hole + +- In XLogRecordAssemble() + +### Problem we're trying to solve + +This change was made in v16. Starting with v16, when PostgreSQL extends a relation, it first extends +it with zeros, and it can extend the relation more than one block at a time. The all-zeros page is WAL-logged, but it's very wasteful to include 8 kB of zeros in the WAL for that. This hack was made so that we WAL logged a compact record with a whole-page "hole".
However, PostgreSQL has assertions that prevent such WAL records from being replayed, so this breaks compatibility such that unmodified PostgreSQL cannot process Neon-generated WAL. + +### How to get rid of the patch + +Find another compact representation for a full-page image of an all-zeros page. A compressed image perhaps. + + +## Shut down walproposer after checkpointer + +``` ++ /* Neon: Also allow walproposer background worker to be treated like a WAL sender, so that it's shut down last */ ++ if ((bp->bkend_type == BACKEND_TYPE_NORMAL || bp->bkend_type == BACKEND_TYPE_BGWORKER) && +``` + +This change was needed so that postmaster shuts down the walproposer process only after the shutdown checkpoint record is written. Otherwise, the shutdown record will never make it to the safekeepers. + +### How to get rid of the patch + +Do a bigger refactoring of the postmaster state machine, such that a background worker can specify +the shutdown ordering by itself. The postmaster state machine has grown pretty complicated, and +would benefit from a refactoring for the sake of readability anyway. + + +## EXPLAIN changes for prefetch and LFC + +### How to get rid of the patch + +Konstantin submitted a patch to -hackers already: https://commitfest.postgresql.org/47/4643/. Get that into a committable state. + + +## On-demand download of extensions + +### How to get rid of the patch + +FUSE or LD_PRELOAD trickery to intercept reads? + + +## Publication superuser checks + +We have hacked CreatePublication so that also neon_superuser can create them. + +### How to get rid of the patch + +Create an upstream patch with more fine-grained privileges for publications CREATE/DROP that can be GRANTed to users. + + +## WAL log replication slots + +### How to get rid of the patch + +Utilize the upcoming v17 "slot sync worker", or a similar neon-specific background worker process, to periodically WAL-log the slots, or to export them somewhere else.
+ + +## WAL-log replication snapshots + +### How to get rid of the patch + +WAL-log them periodically, from a background worker. + + +## WAL-log relmapper files + +Similarly to replication snapshot files, the CID mapping files generated during VACUUM FULL of a catalog table are WAL-logged. + +### How to get rid of the patch + +WAL-log them periodically, from a background worker. + + +## XLogWaitForReplayOf() + +?? + + + + # Not currently committed but proposed ## Disable ring buffer buffer manager strategies @@ -472,23 +481,10 @@ hint bits are set. Wal logging hint bits updates requires FPI which significantl Add special WAL record for setting page hints. -## Prefetching - -### Why? - -As far as pages in Neon are loaded on demand, to reduce node startup time -and also speedup some massive queries we need some mechanism for bulk loading to -reduce page request round-trip overhead. - -Currently Postgres is supporting prefetching only for bitmap scan. -In Neon we should also use prefetch for sequential and index scans, because the OS is not doing it for us. -For sequential scan we could prefetch some number of following pages. For index scan we could prefetch pages -of heap relation addressed by TIDs. - ## Prewarming ### Why? -Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith. +Short downtime (or, in other words, fast compute node restart time) is one of the key features of Neon. But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow. We can capture state of compute node buffer cache and send bulk request for this pages at startup.
diff --git a/docs/docker.md b/docs/docker.md index 9761cc4346..ce806c4e6c 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -4,24 +4,24 @@ Currently we build two main images: -- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). -- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). +- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). +- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. And additional intermediate image: - [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools. -## Building pipeline +## Build pipeline We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs -1. `neondatabase/compute-tools` and `neondatabase/compute-node` +1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14) 2. `neondatabase/neon` ## Docker Compose example -You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. 
+You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following containers. - pageserver x 1 - safekeeper x 3 @@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea 1. create containers You can specify version of neon cluster using following environment values. -- PG_VERSION: postgres version for compute (default is 14) -- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) +- PG_VERSION: postgres version for compute (default is 16 as of this writing) +- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest' ``` $ cd docker-compose/ -$ docker-compose down # remove the conainers if exists -$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version +$ docker-compose down # remove the containers if exists +$ PG_VERSION=16 TAG=latest docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... done (...omit...) @@ -47,29 +47,31 @@ Creating docker-compose_storage_broker_1 ... done 2. connect compute node ``` -$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass -$ chmod 600 ~/.pgpass -$ psql -h localhost -p 55433 -U cloud_admin +$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres +psql (16.3) +Type "help" for help. 
+ postgres=# CREATE TABLE t(key int primary key, value text); CREATE TABLE -postgres=# insert into t values(1,1); +postgres=# insert into t values(1, 1); INSERT 0 1 postgres=# select * from t; - key | value + key | value -----+------- 1 | 1 (1 row) + ``` 3. If you want to see the log, you can use `docker-compose logs` command. ``` # check the container name you want to see $ docker ps -CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES -d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1 +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +3582f6d76227 docker-compose_compute "/shell/compute.sh" 2 minutes ago Up 2 minutes 0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp docker-compose_compute_1 (...omit...) -$ docker logs -f dockercompose_compute_1 +$ docker logs -f docker-compose_compute_1 2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql 2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400' (...omit...) diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md index ba5d3c423e..11d984eb08 100644 --- a/docs/pageserver-services.md +++ b/docs/pageserver-services.md @@ -101,11 +101,12 @@ or ```toml [remote_storage] container_name = 'some-container-name' +storage_account = 'somestorageaccnt' container_region = 'us-east' prefix_in_container = '/test-prefix/' ``` -`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the azure credentials if needed. +The `AZURE_STORAGE_ACCESS_KEY` env variable can be used to specify the azure credentials if needed. 
## Repository background tasks diff --git a/docs/pageserver-storage.md b/docs/pageserver-storage.md index 77e7ff35bc..9902f6b930 100644 --- a/docs/pageserver-storage.md +++ b/docs/pageserver-storage.md @@ -64,7 +64,7 @@ Storage. The LayerMap tracks what layers exist in a timeline. -Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or +Currently, the layer map is just a resizable array (Vec). On a GetPage@LSN or other read request, the layer map scans through the array to find the right layer that contains the data for the requested page. The read-code in LayeredTimeline is aware of the ancestor, and returns data from the ancestor timeline if it's diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index c911d2c53d..5d862415eb 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -22,7 +22,7 @@ timeline to shutdown. It will also wait for them to finish. A task registered in the task registry can check if it has been requested to shut down, by calling `is_shutdown_requested()`. There's -also a `shudown_watcher()` Future that can be used with `tokio::select!` +also a `shutdown_watcher()` Future that can be used with `tokio::select!` or similar, to wake up on shutdown. diff --git a/docs/pageserver-walredo.md b/docs/pageserver-walredo.md index 1de9c177cc..7b366ff616 100644 --- a/docs/pageserver-walredo.md +++ b/docs/pageserver-walredo.md @@ -74,4 +74,4 @@ somewhat wasteful, but because most WAL records only affect one page, the overhead is acceptable. The WAL redo always happens for one particular page. If the WAL record -coantains changes to other pages, they are ignored. +contains changes to other pages, they are ignored. 
diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md index f99683cf09..d11b750e73 100644 --- a/docs/rfcs/002-storage.md +++ b/docs/rfcs/002-storage.md @@ -1,4 +1,4 @@ -# Zenith storage node — alternative +# Neon storage node — alternative ## **Design considerations** diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md index 1a549c2df5..003a05bd16 100644 --- a/docs/rfcs/003-laptop-cli.md +++ b/docs/rfcs/003-laptop-cli.md @@ -1,6 +1,6 @@ # Command line interface (end-user) -Zenith CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside zenith distribution at least at the start. +Neon CLI as it is described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside neon distribution at least at the start. This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is storage URL: no matter how complex cluster setup is it may provide an endpoint where the user may push snapshots. 
@@ -8,40 +8,40 @@ The most important concept here is a snapshot, which can be created/pushed/pulle # Possible usage scenarios -## Install zenith, run a postgres +## Install neon, run a postgres ``` -> brew install pg-zenith -> zenith pg create # creates pgdata with default pattern pgdata$i -> zenith pg list +> brew install pg-neon +> neon pg create # creates pgdata with default pattern pgdata$i +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 0G zenith-local localhost:5432 +primary1 pgdata1 0G neon-local localhost:5432 ``` -## Import standalone postgres to zenith +## Import standalone postgres to neon ``` -> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +> neon snapshot import --from=basebackup://replication@localhost:5432/ oldpg [====================------------] 60% | 20MB/s -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - -> zenith pg create --snapshot oldpg +> neon pg create --snapshot oldpg Started postgres on localhost:5432 -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot destroy oldpg +> neon snapshot destroy oldpg Ok ``` Also, we may start snapshot import implicitly by looking at snapshot schema ``` -> zenith pg create --snapshot basebackup://replication@localhost:5432/ +> neon pg create --snapshot basebackup://replication@localhost:5432/ Downloading snapshot... Done. Started postgres on localhost:5432 Destroying snapshot... Done. @@ -52,39 +52,39 @@ Destroying snapshot... Done. Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). 
``` -> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +> neon pg create --snapshot http://learn-postgres.com/movies_db.neon movies ``` ## Create snapshot and push it to the cloud ``` -> zenith snapshot create pgdata1@snap1 -> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +> neon snapshot create pgdata1@snap1 +> neon snapshot push --to ssh://stas@neon.tech pgdata1@snap1 ``` ## Rollback database to the snapshot -One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `zenith pg checkout`. +One way to rollback the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot which is time consuming operation. Another option that would be cool to support is the ability to create the copy-on-write database from the snapshot without copying data, and store updated pages in a separate location, however that way would have performance implications. So to properly rollback the database to the older state we have `neon pg checkout`. 
``` -> zenith pg list +> neon pg list ID PGDATA USED STORAGE ENDPOINT -primary1 pgdata1 5G zenith-local localhost:5432 +primary1 pgdata1 5G neon-local localhost:5432 -> zenith snapshot create pgdata1@snap1 +> neon snapshot create pgdata1@snap1 -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - pgdata1@CURRENT 6G - -> zenith pg checkout pgdata1@snap1 +> neon pg checkout pgdata1@snap1 Stopping postgres on pgdata1. Rolling back pgdata1@CURRENT to pgdata1@snap1. Starting postgres on pgdata1. -> zenith snapshot list +> neon snapshot list ID SIZE PARENT oldpg 5G - pgdata1@snap1 6G - @@ -99,7 +99,7 @@ Some notes: pgdata1@CURRENT -- implicit snapshot representing the current state PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite). ``` -> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +> neon pitr create --storage s3tank --ttl 30d --name pitr_last_month ``` Resetting the database to some state in past would require creating a snapshot on some lsn / time in this pirt area. @@ -108,29 +108,29 @@ Resetting the database to some state in past would require creating a snapshot o ## storage -Storage is either zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. +Storage is either neon pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. 
-**zenith storage attach** -t [native|s3] -c key=value -n name +**neon storage attach** -t [native|s3] -c key=value -n name -Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Other possible term for native is 'zstore'. +Attaches/initializes storage. For --type=s3, user credentials and path should be provided. For --type=native we may support --path=/local/path and --url=neon.tech/stas/mystore. Other possible term for native is 'zstore'. -**zenith storage list** +**neon storage list** Show currently attached storages. For example: ``` -> zenith storage list +> neon storage list NAME USED TYPE OPTIONS PATH -local 5.1G zenith-local /opt/zenith/store/local -local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr -zcloud 60G zenith-remote zenith.tech/stas/mystore +local 5.1G neon-local /opt/neon/store/local +local.compr 20.4G neon-local compression=on /opt/neon/store/local.compr +zcloud 60G neon-remote neon.tech/stas/mystore s3tank 80G S3 ``` -**zenith storage detach** +**neon storage detach** -**zenith storage show** +**neon storage show** @@ -140,29 +140,29 @@ Manages postgres data directories and can start postgres instances with proper c Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together. -**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata +**neon pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata Creates (initializes) new data directory in given storage and starts postgres. I imagine that storage for this operation may be only local and data movement to remote location happens through snapshots/pitr. --no-start: just init datadir without creating ---snapshot snap: init from the snapshot. 
Snap is a name or URL (zenith.tech/stas/mystore/snap1) +--snapshot snap: init from the snapshot. Snap is a name or URL (neon.tech/stas/mystore/snap1) --cow: initialize Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of currently running a database) -**zenith pg destroy** +**neon pg destroy** -**zenith pg start** [--replica] pgdata +**neon pg start** [--replica] pgdata Start postgres with proper extensions preloaded/installed. -**zenith pg checkout** +**neon pg checkout** Rollback data directory to some previous snapshot. -**zenith pg stop** pg_id +**neon pg stop** pg_id -**zenith pg list** +**neon pg list** ``` ROLE PGDATA USED STORAGE ENDPOINT @@ -173,7 +173,7 @@ primary my_pg2 3.2G local.compr localhost:5435 - my_pg3 9.2G local.compr - ``` -**zenith pg show** +**neon pg show** ``` my_pg: @@ -194,7 +194,7 @@ my_pg: ``` -**zenith pg start-rest/graphql** pgdata +**neon pg start-rest/graphql** pgdata Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, just an idea. @@ -203,35 +203,35 @@ Starts REST/GraphQL proxy on top of postgres master. Not sure we should do that, Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. Snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before i-th checkout. -**zenith snapshot create** pgdata_name@snap_name +**neon snapshot create** pgdata_name@snap_name Creates a new snapshot in the same storage where pgdata_name exists. -**zenith snapshot push** --to url pgdata_name@snap_name +**neon snapshot push** --to url pgdata_name@snap_name -Produces binary stream of a given snapshot. 
Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `zenith snapshot recv` before push happens. If url has some special schema like zenith:// receiving side may require auth start `zenith snapshot recv` on the go. +Produces binary stream of a given snapshot. Under the hood starts temp read-only postgres over this snapshot and sends basebackup stream. Receiving side should start `neon snapshot recv` before push happens. If url has some special schema like neon:// receiving side may require auth start `neon snapshot recv` on the go. -**zenith snapshot recv** +**neon snapshot recv** Starts a port listening for a basebackup stream, prints connection info to stdout (so that user may use that in push command), and expects data on that socket. -**zenith snapshot pull** --from url or path +**neon snapshot pull** --from url or path -Connects to a remote zenith/s3/file and pulls snapshot. The remote site should be zenith service or files in our format. +Connects to a remote neon/s3/file and pulls snapshot. The remote site should be neon service or files in our format. -**zenith snapshot import** --from basebackup://<...> or path +**neon snapshot import** --from basebackup://<...> or path Creates a new snapshot out of running postgres via basebackup protocol or basebackup files. -**zenith snapshot export** +**neon snapshot export** -Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith own format which is handy for us (but I think just tar of basebackup would be okay). +Starts read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be neon own format which is handy for us (but I think just tar of basebackup would be okay). 
-**zenith snapshot diff** snap1 snap2 +**neon snapshot diff** snap1 snap2 Shows size of data changed between two snapshots. We also may provide options to diff schema/data in tables. To do that start temp read-only postgreses. -**zenith snapshot destroy** +**neon snapshot destroy** ## pitr @@ -239,7 +239,7 @@ Pitr represents wal stream and ttl policy for that stream XXX: any suggestions on a better name? -**zenith pitr create** name +**neon pitr create** name --ttl = inf | period @@ -247,21 +247,21 @@ XXX: any suggestions on a better name? --storage = storage_name -**zenith pitr extract-snapshot** pitr_name --lsn xxx +**neon pitr extract-snapshot** pitr_name --lsn xxx Creates a snapshot out of some lsn in PITR area. The obtained snapshot may be managed with snapshot routines (move/send/export) -**zenith pitr gc** pitr_name +**neon pitr gc** pitr_name Force garbage collection on some PITR area. -**zenith pitr list** +**neon pitr list** -**zenith pitr destroy** +**neon pitr destroy** ## console -**zenith console** +**neon console** Opens browser targeted at web console with the more or less same functionality as described here. diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md index d4716156d1..6b83c77403 100644 --- a/docs/rfcs/004-durability.md +++ b/docs/rfcs/004-durability.md @@ -6,7 +6,7 @@ When do we consider the WAL record as durable, so that we can acknowledge the commit to the client and be reasonably certain that we will not lose the transaction? -Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. +Neon uses a group of WAL safekeeper nodes to hold the generated WAL. A WAL record is considered durable, when it has been written to a majority of WAL safekeeper nodes. In this document, I use 5 safekeepers, because I have five fingers. 
A WAL record is durable, diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md index e36d0a9ae3..6c283d7a37 100644 --- a/docs/rfcs/005-zenith_local.md +++ b/docs/rfcs/005-zenith_local.md @@ -1,23 +1,23 @@ -# Zenith local +# Neon local -Here I list some objectives to keep in mind when discussing zenith-local design and a proposal that brings all components together. Your comments on both parts are very welcome. +Here I list some objectives to keep in mind when discussing neon-local design and a proposal that brings all components together. Your comments on both parts are very welcome. #### Why do we need it? - For distribution - this easy to use binary will help us to build adoption among developers. - For internal use - to test all components together. -In my understanding, we consider it to be just a mock-up version of zenith-cloud. +In my understanding, we consider it to be just a mock-up version of neon-cloud. > Question: How much should we care about durability and security issues for a local setup? #### Why is it better than a simple local postgres? -- Easy one-line setup. As simple as `cargo install zenith && zenith start` +- Easy one-line setup. As simple as `cargo install neon && neon start` - Quick and cheap creation of compute nodes over the same storage. > Question: How can we describe a use-case for this feature? -- Zenith-local can work with S3 directly. +- Neon-local can work with S3 directly. - Push and pull images (snapshots) to remote S3 to exchange data with other users. @@ -31,50 +31,50 @@ Ideally, just one binary that incorporates all elements we need. #### Components: -- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. 
-CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli +- **neon-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way. +CLI proposal is here https://github.com/neondatabase/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src/bin/cli -- **zenith-console** - WEB UI with same functionality as CLI. +- **neon-console** - WEB UI with same functionality as CLI. >Note: not for the first release. -- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. - > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping zenith-local. +- **neon-local** - entrypoint. Service that starts all other components and handles REST API requests. See REST API proposal below. + > Idea: spawn all other components as child processes, so that we could shutdown everything by stopping neon-local. -- **zenith-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). +- **neon-pageserver** - consists of a storage and WAL-replaying service (modified PG in current implementation). > Question: Probably, for local setup we should be able to bypass page-storage and interact directly with S3 to avoid double caching in shared buffers and page-server? -WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src +WIP code is here: https://github.com/neondatabase/postgres/tree/main/pageserver/src -- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to zenith. +- **neon-S3** - stores base images of the database and WAL in S3 object storage. Import and export images from/to neon. 
> Question: How should it operate in a local setup? Will we manage it ourselves or ask user to provide credentials for existing S3 object storage (i.e. minio)? > Question: Do we use it together with local page store or they are interchangeable? WIP code is ??? -- **zenith-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. +- **neon-safekeeper** - receives WAL from postgres, stores it durably, answers to Postgres that "sync" is succeed. > Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system. -WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper +WIP code is here: https://github.com/neondatabase/postgres/tree/main/src/bin/safekeeper -- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. +- **neon-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. User can quickly create and destroy them and work with it as a regular postgres database. - WIP code is in main branch and here: https://github.com/libzenith/postgres/commits/compute_node + WIP code is in main branch and here: https://github.com/neondatabase/postgres/commits/compute_node #### REST API: Service endpoint: `http://localhost:3000` Resources: -- /storages - Where data lives: zenith-pageserver or zenith-s3 -- /pgs - Postgres - zenith-computenode +- /storages - Where data lives: neon-pageserver or neon-s3 +- /pgs - Postgres - neon-computenode - /snapshots - snapshots **TODO** ->Question: Do we want to extend this API to manage zenith components? I.e. start page-server, manage safekeepers and so on? 
Or they will be hardcoded to just start once and for all? +>Question: Do we want to extend this API to manage neon components? I.e. start page-server, manage safekeepers and so on? Or they will be hardcoded to just start once and for all? Methods and their mapping to CLI: -- /storages - zenith-pageserver or zenith-s3 +- /storages - neon-pageserver or neon-s3 CLI | REST API ------------- | ------------- @@ -84,7 +84,7 @@ storage list | GET /storages storage show -n name | GET /storages/:storage_name -- /pgs - zenith-computenode +- /pgs - neon-computenode CLI | REST API ------------- | ------------- diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md index 84dc932211..5030ecc7e7 100644 --- a/docs/rfcs/006-laptop-cli-v2-CLI.md +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -1,45 +1,45 @@ -Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". +Neon CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters and cluster is a loaded term in the modern infrastructure we will call it "catalog". # CLI v2 (after chatting with Carl) -Zenith introduces the notion of a repository. +Neon introduces the notion of a repository. 
```bash -zenith init -zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory +neon init +neon clone neon://neon.tech/piedpiper/northwind -- clones a repo to the northwind directory ``` Once you have a cluster catalog you can explore it ```bash -zenith log -- returns a list of commits -zenith status -- returns if there are changes in the catalog that can be committed -zenith commit -- commits the changes and generates a new commit hash -zenith branch experimental -- creates a branch called testdb based on a given commit hash +neon log -- returns a list of commits +neon status -- returns if there are changes in the catalog that can be committed +neon commit -- commits the changes and generates a new commit hash +neon branch experimental -- creates a branch called testdb based on a given commit hash ``` To make changes in the catalog you need to run compute nodes ```bash -- here is how you a compute node -zenith start /home/pipedpiper/northwind:main -- starts a compute instance -zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud +neon start /home/pipedpiper/northwind:main -- starts a compute instance +neon start neon://neon.tech/northwind:main -- starts a compute instance in the cloud -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port) -- you can start a compute node against any hash or branch -zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) +neon start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on different port) -- After running some DML you can run --- zenith status and see how there are two WAL streams one on top of +-- neon status and see how there are 
two WAL streams one on top of -- the main branch -zenith status +neon status -- and another on top of the experimental branch -zenith status -b experimental +neon status -b experimental -- you can commit each branch separately -zenith commit main +neon commit main -- or -zenith commit -c /home/pipedpiper/northwind:experimental +neon commit -c /home/pipedpiper/northwind:experimental ``` Starting compute instances against cloud environments @@ -47,18 +47,18 @@ Starting compute instances against cloud environments ```bash -- you can start a compute instance against the cloud environment -- in this case all of the changes will be streamed into the cloud -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith start https://zenith:tech/pipedpiper/northwind:main -zenith status -c https://zenith:tech/pipedpiper/northwind:main -zenith commit -c https://zenith:tech/pipedpiper/northwind:main -zenith branch -c https://zenith:tech/pipedpiper/northwind: experimental +neon start https://neon:tech/pipedpiper/northwind:main +neon start https://neon:tech/pipedpiper/northwind:main +neon status -c https://neon:tech/pipedpiper/northwind:main +neon commit -c https://neon:tech/pipedpiper/northwind:main +neon branch -c https://neon:tech/pipedpiper/northwind: experimental ``` Pushing data into the cloud ```bash -- pull all the commits from the cloud -zenith pull +neon pull -- push all the commits to the cloud -zenith push +neon push ``` diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md index e6e6e172ad..749a940313 100644 --- a/docs/rfcs/006-laptop-cli-v2-repository-structure.md +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -1,14 +1,14 @@ # Repository format -A Zenith repository is similar to a traditional PostgreSQL backup +A Neon repository is similar to a traditional PostgreSQL backup archive, like a WAL-G bucket or pgbarman backup catalogue.
It holds multiple versions of a PostgreSQL database cluster. -The distinguishing feature is that you can launch a Zenith Postgres +The distinguishing feature is that you can launch a Neon Postgres server directly against a branch in the repository, without having to -"restore" it first. Also, Zenith manages the storage automatically, +"restore" it first. Also, Neon manages the storage automatically, there is no separation between full and incremental backups nor WAL -archive. Zenith relies heavily on the WAL, and uses concepts similar +archive. Neon relies heavily on the WAL, and uses concepts similar to incremental backups and WAL archiving internally, but it is hidden from the user. @@ -19,15 +19,15 @@ efficient. Just something to get us started. The repository directory looks like this: - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// - .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// + .neon/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history - .zenith/refs/branches/mybranch - .zenith/refs/tags/foo - .zenith/refs/tags/bar + .neon/refs/branches/mybranch + .neon/refs/tags/foo + .neon/refs/tags/bar - .zenith/datadirs/ + .neon/datadirs/ ### Timelines @@ -39,7 +39,7 @@ All WAL is generated on a timeline. You can launch a read-only node against a tag or arbitrary LSN on a timeline, but in order to write, you need to create a timeline. -Each timeline is stored in a directory under .zenith/timelines. It +Each timeline is stored in a directory under .neon/timelines. It consists of a WAL archive, containing all the WAL in the standard PostgreSQL format, under the wal/ subdirectory. @@ -66,18 +66,18 @@ contains the UUID of the timeline (and LSN, for tags). ### Datadirs -.zenith/datadirs contains PostgreSQL data directories. 
You can launch +.neon/datadirs contains PostgreSQL data directories. You can launch a Postgres instance on one of them with: ``` - postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c + postgres -D .neon/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c ``` All the actual data is kept in the timeline directories, under -.zenith/timelines. The data directories are only needed for active +.neon/timelines. The data directories are only needed for active PostgreQSL instances. After an instance is stopped, the data directory -can be safely removed. "zenith start" will recreate it quickly from -the data in .zenith/timelines, if it's missing. +can be safely removed. "neon start" will recreate it quickly from +the data in .neon/timelines, if it's missing. ## Version 2 @@ -103,14 +103,14 @@ more advanced. The exact format is TODO. But it should support: ### Garbage collection -When you run "zenith gc", old timelines that are no longer needed are +When you run "neon gc", old timelines that are no longer needed are removed. That involves collecting the list of "unreachable" objects, starting from the named branches and tags. Also, if enough WAL has been generated on a timeline since last snapshot, a new snapshot or delta is created. -### zenith push/pull +### neon push/pull Compare the tags and branches on both servers, and copy missing ones. For each branch, compare the timeline it points to in both servers. If @@ -123,7 +123,7 @@ every time you start up an instance? Then you would detect that the timelines have diverged. That would match with the "epoch" concept that we have in the WAL safekeeper -### zenith checkout/commit +### neon checkout/commit In this format, there is no concept of a "working tree", and hence no concept of checking out or committing. All modifications are done on @@ -134,7 +134,7 @@ You can easily fork off a temporary timeline to emulate a "working tree". 
You can later remove it and have it garbage collected, or to "commit", re-point the branch to the new timeline. -If we want to have a worktree and "zenith checkout/commit" concept, we can +If we want to have a worktree and "neon checkout/commit" concept, we can emulate that with a temporary timeline. Create the temporary timeline at -"zenith checkout", and have "zenith commit" modify the branch to point to +"neon checkout", and have "neon commit" modify the branch to point to the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md index e6355f4a03..96f117bfe9 100644 --- a/docs/rfcs/007-serverless-on-laptop.md +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -4,27 +4,27 @@ How it works now 1. Create repository, start page server on it ``` -$ zenith init +$ neon init ... created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create a branch, and start a Postgres instance on it ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 -$ zenith pg create heikki +$ neon pg create heikki Initializing Postgres on timeline 76cf9279915be7797095241638e64644... -Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 +Extracting base backup to create postgres instance: path=.neon/pgdatadirs/pg1 port=55432 -$ zenith pg start pg1 +$ neon pg start pg1 Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' waiting for server to start.... done server started @@ -52,20 +52,20 @@ serverless on your laptop, so that the workflow becomes just: 1. Create repository, start page server on it (same as before) ``` -$ zenith init +$ neon init ... 
created main branch -new zenith repository was created in .zenith +new neon repository was created in .neon -$ zenith pageserver start -Starting pageserver at '127.0.0.1:64000' in .zenith +$ neon pageserver start +Starting pageserver at '127.0.0.1:64000' in .neon Page server started ``` 2. Create branch ``` -$ zenith branch heikki main +$ neon branch heikki main branching at end of WAL: 0/15ECF68 ``` diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md index 272628e1ce..a36932222a 100644 --- a/docs/rfcs/008-push-pull.md +++ b/docs/rfcs/008-push-pull.md @@ -7,22 +7,22 @@ Here is a proposal about implementing push/pull mechanics between pageservers. W The origin represents connection info for some remote pageserver. Let's use here same commands as git uses except using explicit list subcommand (git uses `origin -v` for that). ``` -zenith origin add -zenith origin list -zenith origin remove +neon origin add +neon origin list +neon origin remove ``` Connection URI a string of form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs or require ssh as transport or invent some other kind of transport. -Behind the scenes, this commands may update toml file inside .zenith directory. +Behind the scenes, these commands may update a toml file inside the .neon directory.
## Push ### Pushing branch ``` -zenith push mybranch cloudserver # push to eponymous branch in cloudserver -zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +neon push mybranch cloudserver # push to eponymous branch in cloudserver +neon push mybranch cloudserver:otherbranch # push to a different branch in cloudserver ``` Exact mechanics would be slightly different in the following situations: diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md index 0acbd68f86..bbd0f75fe2 100644 --- a/docs/rfcs/009-snapshot-first-storage-cli.md +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -2,7 +2,7 @@ While working on export/import commands, I understood that they fit really well We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files. -Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith. +Even if neon aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to neon. So here is an attempt to design consistent CLI for different usage scenarios: @@ -16,8 +16,8 @@ Save`storage_dest` and other parameters in config. Push snapshots to `storage_dest` in background. ``` -zenith init --storage_dest=S3_PREFIX -zenith start +neon init --storage_dest=S3_PREFIX +neon start ``` #### 2. Restart pageserver (manually or crash-recovery). @@ -25,7 +25,7 @@ Take `storage_dest` from pageserver config, start pageserver from latest snapsho Push snapshots to `storage_dest` in background. ``` -zenith start +neon start ``` #### 3. Import. @@ -35,22 +35,22 @@ Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time Save`storage_dest` parameters in config.
Push snapshots to `storage_dest` in background. ``` -//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage. -zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX -zenith start +//I.e. we want to start neon on top of existing $PGDATA and use s3 as a persistent storage. +neon init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX +neon start ``` How to pass credentials needed for `snapshot_path`? #### 4. Export. Manually push snapshot to `snapshot_path` which differs from `storage_dest` -Optionally set `snapshot_format`, which can be plain pgdata format or zenith format. +Optionally set `snapshot_format`, which can be plain pgdata format or neon format. ``` -zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata +neon export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata ``` #### Notes and questions - safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI? -- Why do we need `zenith init` as a separate command? Can't we init everything at first start? +- Why do we need `neon init` as a separate command? Can't we init everything at first start? - We can think of better names for all options. - Export to plain postgres format will be useless, if we are not 100% compatible on page level. I can recall at least one such difference - PD_WAL_LOGGED flag in pages. diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md index 7e815abf73..2f3ccbc09b 100644 --- a/docs/rfcs/013-term-history.md +++ b/docs/rfcs/013-term-history.md @@ -9,7 +9,7 @@ receival and this might lag behind `term`; safekeeper switches to epoch `n` when it has received all committed log records from all `< n` terms. This roughly corresponds to proposed in -https://github.com/zenithdb/rfcs/pull/3/files +https://github.com/neondatabase/rfcs/pull/3/files This makes our biggest our difference from Raft. 
In Raft, every log record is diff --git a/docs/rfcs/014-safekeepers-gossip.md b/docs/rfcs/014-safekeepers-gossip.md index 3d6cc04b94..ff38a0a0ef 100644 --- a/docs/rfcs/014-safekeepers-gossip.md +++ b/docs/rfcs/014-safekeepers-gossip.md @@ -1,6 +1,6 @@ # Safekeeper gossip -Extracted from this [PR](https://github.com/zenithdb/rfcs/pull/13) +Extracted from this [PR](https://github.com/neondatabase/rfcs/pull/13) ## Motivation diff --git a/docs/rfcs/015-storage-messaging.md b/docs/rfcs/015-storage-messaging.md index a415b90459..7702311d65 100644 --- a/docs/rfcs/015-storage-messaging.md +++ b/docs/rfcs/015-storage-messaging.md @@ -2,7 +2,7 @@ Created on 19.01.22 -Initially created [here](https://github.com/zenithdb/rfcs/pull/16) by @kelvich. +Initially created [here](https://github.com/neondatabase/rfcs/pull/16) by @kelvich. That it is an alternative to (014-safekeeper-gossip)[] @@ -292,4 +292,4 @@ But with an etcd we are in a bit different situation: 1. We don't need persistency and strong consistency guarantees for the data we store in the etcd 2. etcd uses Grpc as a protocol, and messages are pretty simple -So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local zenith installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). +So it looks like implementing in-mem store with etcd interface is straightforward thing _if we will want that in future_. At the same time, we can avoid implementing it right now, and we will be able to run local neon installation with etcd running somewhere in the background (as opposed to building and running console, which in turn requires Postgres). 
diff --git a/docs/rfcs/017-console-split.md b/docs/rfcs/017-console-split.md new file mode 100644 index 0000000000..8036920610 --- /dev/null +++ b/docs/rfcs/017-console-split.md @@ -0,0 +1,420 @@ +# Splitting cloud console + +Created on 17.06.2022 + +## Summary + +Currently we have `cloud` repository that contains code implementing public API for our clients as well as code for managing storage and internal infrastructure services. We can split everything user-related from everything storage-related to make it easier to test and maintain. + +This RFC proposes to introduce a new control-plane service with HTTP API. The overall architecture will look like this: + +```markup +. x + external area x internal area + (our clients) x (our services) + x + x ┌───────────────────────┐ + x ┌───────────────┐ > ┌─────────────────────┐ │ Storage (EC2) │ + x │ console db │ > │ control-plane db │ │ │ + x └───────────────┘ > └─────────────────────┘ │ - safekeepers │ + x ▲ > ▲ │ - pageservers │ + x │ > │ │ │ +┌──────────────────┐ x ┌───────┴───────┐ > │ │ Dependencies │ +│ browser UI ├──►│ │ > ┌──────────┴──────────┐ │ │ +└──────────────────┘ x │ │ > │ │ │ - etcd │ + x │ console ├───────►│ control-plane ├────►│ - S3 │ +┌──────────────────┐ x │ │ > │ (deployed in k8s) │ │ - more? 
│ +│public API clients├──►│ │ > │ │ │ │ +└──────────────────┘ x └───────┬───────┘ > └──────────┬──────────┘ └───────────────────────┘ + x │ > ▲ │ ▲ + x │ > │ │ │ + x ┌───────┴───────┐ > │ │ ┌───────────┴───────────┐ + x │ dependencies │ > │ │ │ │ + x │- analytics │ > │ └───────────────►│ computes │ + x │- auth │ > │ │ (deployed in k8s) │ + x │- billing │ > │ │ │ + x └───────────────┘ > │ └───────────────────────┘ + x > │ ▲ + x > ┌─────┴───────────────┐ │ +┌──────────────────┐ x > │ │ │ +│ │ x > │ proxy ├─────────────────┘ +│ postgres ├───────────────────────────►│ (deployed in k8s) │ +│ users │ x > │ │ +│ │ x > └─────────────────────┘ +└──────────────────┘ x > + > + > + closed-source > open-source + > + > +``` + +Notes: + +- diagram is simplified in the less-important places +- directed arrows are strict and mean that connections in the reverse direction are forbidden + +This split is quite complex and this RFC proposes several smaller steps to achieve the larger goal: + +1. Start by refactoring the console code, the goal is to have console and control-plane code in the different directories without dependencies on each other. +2. Do similar refactoring for tables in the console database, remove queries selecting data from both console and control-plane; move control-plane tables to a separate database. +3. Implement control-plane HTTP API serving on a separate TCP port; make all console→control-plane calls to go through that HTTP API. +4. Move control-plane source code to the neon repo; start control-plane as a separate service. + +## Motivation + +These are the two most important problems we want to solve: + +- Publish open-source implementation of all our cloud/storage features +- Make a unified control-plane that is used in all cloud (serverless) and local (tests) setups + +Right now we have some closed-source code in the cloud repo. 
That code contains implementation for running Neon computes in k8s and without that code it’s impossible to automatically scale PostgreSQL computes. That means that we don’t have an open-source serverless PostgreSQL at the moment. + +After splitting and open-sourcing control-plane service we will have source code and Docker images for all storage services. That control-plane service should have HTTP API for creating and managing tenants (including all our storage features), while proxy will listen for incoming connections and create computes on-demand. + +Improving our test suite is an important task, but requires a lot of prerequisites and may require a separate RFC. Possible implementation of that is described in the section [Next steps](#next-steps). + +Another piece of motivation can be a better involvement of storage development team into a control-plane. By splitting control-plane from the console, it can be more convenient to test and develop control-plane while paying less attention to “business” features, such as user management, billing and analytics. + +For example, console currently requires authentication providers such as GitHub OAuth to work at all, as well as nodejs to be able to build it locally. It will be more convenient to build and run it locally without these requirements. + +## Proposed implementation + +### Current state of things + +Let’s start with defining the current state of things at the moment of this proposal. We have three repositories containing source code: + +- open-source `postgres` — our fork of postgres +- open-source `neon` — our main repository for storage source code +- closed-source `cloud` — mostly console backend and UI frontend + +This proposal aims not to change anything at the existing code in `neon` and `postgres` repositories, but to create control-plane service and move its source code from `cloud` to the `neon` repository.
That means that we need to split code in `cloud` repo only, and will consider only this repository for exploring its source code. + +Let’s look at the miscellaneous things in the `cloud` repo which are NOT part of the console application, i.e. NOT the Go source code that is compiled to the `./console` binary. There we have: + +- command-line tools, such as cloudbench, neonadmin +- markdown documentation +- cloud operations scripts (helm, terraform, ansible) +- configs and other things +- e2e python tests +- incidents playbooks +- UI frontend +- Make build scripts, code generation scripts +- database migrations +- swagger definitions + +And also let’s take a look at what we have in the console source code, which is the service we’d like to split: + +- API Servers + - Public API v2 + - Management API v2 + - Public API v1 + - Admin API v1 (same port as Public API v1) + - Management API v1 +- Workers + - Monitor Compute Activity + - Watch Failed Operations + - Availability Checker + - Business Metrics Collector +- Internal Services + - Auth Middleware, UserIsAdmin, Cookies + - Cable Websocket Server + - Admin Services + - Global Settings, Operations, Pageservers, Platforms, Projects, Safekeepers, Users + - Authenticate Proxy + - API Keys + - App Controller, serving UI HTML + - Auth Controller + - Branches + - Projects + - Psql Connect + Passwordless login + - Users + - Cloud Metrics + - User Metrics + - Invites + - Pageserver/Safekeeper management + - Operations, k8s/docker/common logic + - Platforms, Regions + - Project State + - Projects Roles, SCRAM + - Global Settings +- Other things + - segment analytics integration + - sentry integration + - other common utilities packages + +### Drawing the splitting line + +The most challenging and the most important thing is to define the line that will split new control-plane service from the existing cloud service. If we don’t get it right, then we can end up with having a lot more issues without many benefits. 
+ +We propose to define that line as follows: + +- everything user-related stays in the console service +- everything storage-related should be in the control-plane service +- something that falls in between should be decided where to go, but most likely should stay in the console service +- some similar parts should be in both services, such as admin/management/db_migrations + +We call user-related all requests that can be connected to some user. The general idea is don’t have any user_id in the control-plane service and operate exclusively on tenant_id+timeline_id, the same way as existing storage services work now (compute, safekeeper, pageserver). + +Storage-related things can be defined as doing any of the following: + +- using k8s API +- doing requests to any of the storage services (proxy, compute, safekeeper, pageserver, etc..) +- tracking current status of tenants/timelines, managing lifetime of computes + +Based on that idea, we can say that new control-plane service should have the following components: + +- single HTTP API for everything + - Create and manage tenants and timelines + - Manage global settings and storage configuration (regions, platforms, safekeepers, pageservers) + - Admin API for storage health inspection and debugging +- Workers + - Monitor Compute Activity + - Watch Failed Operations + - Availability Checker +- Internal Services + - Admin Services + - Global Settings, Operations, Pageservers, Platforms, Tenants, Safekeepers + - Authenticate Proxy + - Branches + - Psql Connect + - Cloud Metrics + - Pageserver/Safekeeper management + - Operations, k8s/docker/common logic + - Platforms, Regions + - Tenant State + - Compute Roles, SCRAM + - Global Settings + +--- + +And other components should probably stay in the console service: + +- API Servers (no changes here) + - Public API v2 + - Management API v2 + - Public API v1 + - Admin API v1 (same port as Public API v1) + - Management API v1 +- Workers + - Business Metrics Collector +- 
Internal Services + - Auth Middleware, UserIsAdmin, Cookies + - Cable Websocket Server + - Admin Services + - Users admin stays the same + - Other admin services can redirect requests to the control-plane + - API Keys + - App Controller, serving UI HTML + - Auth Controller + - Projects + - User Metrics + - Invites + - Users + - Passwordless login +- Other things + - segment analytics integration + - sentry integration + - other common utilities packages + +There are also miscellaneous things that are useful for all kinds of services. So we can say that these things can be in both services: + +- markdown documentation +- e2e python tests +- make build scripts, code generation scripts +- database migrations +- swagger definitions + +The single entrypoint to the storage should be control-plane API. After we define that API, we can have code-generated implementation for the client and for the server. The general idea is to move code implementing storage components from the console to the API implementation inside the new control-plane service. 
+ +After the code is moved to the new service, we can fill the created void by making API calls to the new service: + +- authorization of the client +- mapping user_id + project_id to the tenant_id +- calling the control-plane API + +### control-plane API + +Currently we have the following projects API in the console: + +``` +GET /projects/{project_id} +PATCH /projects/{project_id} +POST /projects/{project_id}/branches +GET /projects/{project_id}/databases +POST /projects/{project_id}/databases +GET /projects/{project_id}/databases/{database_id} +PUT /projects/{project_id}/databases/{database_id} +DELETE /projects/{project_id}/databases/{database_id} +POST /projects/{project_id}/delete +GET /projects/{project_id}/issue_token +GET /projects/{project_id}/operations +GET /projects/{project_id}/operations/{operation_id} +POST /projects/{project_id}/query +GET /projects/{project_id}/roles +POST /projects/{project_id}/roles +GET /projects/{project_id}/roles/{role_name} +DELETE /projects/{project_id}/roles/{role_name} +POST /projects/{project_id}/roles/{role_name}/reset_password +POST /projects/{project_id}/start +POST /projects/{project_id}/stop +POST /psql_session/{psql_session_id} +``` + +It looks fine and we probably already have clients relying on it. So we should not change it, at least for now. 
But most of these endpoints (if not all) are related to storage, and it can suggest us what control-plane API should look like: + +``` +GET /tenants/{tenant_id} +PATCH /tenants/{tenant_id} +POST /tenants/{tenant_id}/branches +GET /tenants/{tenant_id}/databases +POST /tenants/{tenant_id}/databases +GET /tenants/{tenant_id}/databases/{database_id} +PUT /tenants/{tenant_id}/databases/{database_id} +DELETE /tenants/{tenant_id}/databases/{database_id} +POST /tenants/{tenant_id}/delete +GET /tenants/{tenant_id}/issue_token +GET /tenants/{tenant_id}/operations +GET /tenants/{tenant_id}/operations/{operation_id} +POST /tenants/{tenant_id}/query +GET /tenants/{tenant_id}/roles +POST /tenants/{tenant_id}/roles +GET /tenants/{tenant_id}/roles/{role_name} +DELETE /tenants/{tenant_id}/roles/{role_name} +POST /tenants/{tenant_id}/roles/{role_name}/reset_password +POST /tenants/{tenant_id}/start +POST /tenants/{tenant_id}/stop +POST /psql_session/{psql_session_id} +``` + +One of the options here is to use gRPC instead of the HTTP, which has some useful features, but there are some strong points towards using plain HTTP: + +- HTTP API is easier to use for the clients +- we already have HTTP API in pageserver/safekeeper/console +- we probably want control-plane API to be similar to the console API, available in the cloud + +### Getting updates from the storage + +There can be some valid cases, when we would like to know what is changed in the storage. For example, console might want to know when user has queried and started compute and when compute was scaled to zero after that, to know how much user should pay for the service. Another example is to get info about reaching the disk space limits. Yet another example is to do analytics, such as how many users had at least one active project in a month. + +All of the above cases can happen without using the console, just by accessing compute through the proxy. 
+ +To solve this, we can have a log of events occurring in the storage (event logs). That is very similar to operations table we have right now, the only difference is that events are immutable and we cannot change them after saving to the database. For example, we might want to have events for the following activities: + +- We finished processing some HTTP API query, such as resetting the password +- We changed some state, such as started or stopped a compute +- Operation is created +- Operation is started for the first time +- Operation is failed for the first time +- Operation is finished + +Once we save these events to the database, we can create HTTP API to subscribe to these events. That API can look like this: + +``` +GET /events/ + +{ + "events": [...], + "next_cursor": 123 +} +``` + +It should be possible to replay event logs from some point of time, to get a state of almost anything from the storage services. That means that if we maintain some state in the control-plane database and we have a reason to have the same state in the console database, it is possible by polling events from the control-plane API and changing the state in the console database according to the events. + +### Next steps + +After implementing control-plane HTTP API and starting control-plane as a separate service, we might want to think of exploiting benefits of the new architecture, such as reorganizing test infrastructure. Possible options are listed in the [Next steps](#next-steps-1). + +## Non Goals + +RFC doesn’t cover the actual cloud deployment scripts and schemas, such as terraform, ansible, k8s yaml’s and so on. + +## Impacted components + +Mostly console, but can also affect some storage service. + +## Scalability + +We should support starting several instances of the new control-plane service at the same time. + +At the same time, it should be possible to use only single instance of control-plane, which can be useful for local tests. 
+ +## Security implications + +New control-plane service is an internal service, so no external requests can reach it. But at the same time, it contains API to do absolutely anything with any of the tenants. That means that a bad internal actor can potentially read and write all of the tenants. To make this safer, we can have one of these: + +- Simple option is to protect all requests with a single private key, so that no one can make requests without having that one key. +- Another option is to have a separate token for every tenant and store these tokens in another secure place. This way it’s harder to access all tenants at once, because they have different tokens. + +## Alternative implementation + +There was an idea to create a k8s operator for managing storage services and computes, but author of this RFC is not really familiar with it. + +Regarding smaller alternative ideas, there are other options for the name of the new control-plane service: + +- storage-ctl +- cloud +- cloud-ctl + +## Pros/cons of proposed approaches (TODO) + +Pros: + +- All storage features are completely open-source +- Better test coverage, less difference between cloud and local setups +- Easier to develop storage and cloud features, because there is no need to set up console for that +- Easier to deploy storage-only services to any cloud + +Cons: + +- All storage features are completely open-source +- Distributed services mean more code to connect different services and potential network issues +- Console needs to have a dependency on storage API, there can be complications with developing new feature in a branch +- More code to JOIN data from different services (console and control-plane) + +## Definition of Done + +We have a new control-plane service running in k8s. Source code for that control-plane service is located in the open-source neon repo. + +## Next steps + +After we’ve reached DoD, we can make further improvements.
+ +First thing that can benefit from the split is local testing. The same control-plane service can implement starting computes as a local processes instead of k8s deployments. If it will also support starting pageservers/safekeepers/proxy for the local setup, then it can completely replace `./neon_local` binary, which is currently used for testing. The local testing environment can look like this: + +``` +┌─────────────────────┐ ┌───────────────────────┐ +│ │ │ Storage (local) │ +│ control-plane db │ │ │ +│ (local process) │ │ - safekeepers │ +│ │ │ - pageservers │ +└──────────▲──────────┘ │ │ + │ │ Dependencies │ +┌──────────┴──────────┐ │ │ +│ │ │ - etcd │ +│ control-plane ├────►│ - S3 │ +│ (local process) │ │ - more? │ +│ │ │ │ +└──────────┬──────────┘ └───────────────────────┘ + ▲ │ ▲ + │ │ │ + │ │ ┌───────────┴───────────┐ + │ │ │ │ + │ └───────────────►│ computes │ + │ │ (local processes) │ + │ │ │ +┌──────┴──────────────┐ └───────────────────────┘ +│ │ ▲ +│ proxy │ │ +│ (local process) ├─────────────────┘ +│ │ +└─────────────────────┘ +``` + +The key thing here is that control-plane local service have the same API and almost the same implementation as the one deployed in the k8s. This allows to run the same e2e tests against both cloud and local setups. + +For the python test_runner tests everything can stay mostly the same. To do that, we just need to replace `./neon_local` cli commands with API calls to the control-plane. + +The benefit here will be in having fast local tests that are really close to our cloud setup. Bugs in k8s queries are still cannot be found when running computes as a local processes, but it should be really easy to start k8s locally (for example in k3s) and run the same tests with control-plane connected to the local k8s. + +Talking about console and UI tests, after the split there should be a way to test these without spinning up all the storage locally. New control-plane service has a well-defined API, allowing us to mock it. 
This way we can create UI tests to verify the right calls are issued after specific UI interactions and verify that we render correct messages when API returns errors. \ No newline at end of file diff --git a/docs/rfcs/018-storage-messaging-2.md b/docs/rfcs/018-storage-messaging-2.md index 364f62dd2e..2419dd5fc5 100644 --- a/docs/rfcs/018-storage-messaging-2.md +++ b/docs/rfcs/018-storage-messaging-2.md @@ -78,7 +78,7 @@ with grpc streams and tokio mpsc channels. The implementation description is at It is just 500 lines of code and core functionality is complete. 1-1 pub sub gives about 120k received messages per second; having multiple subscribers in -different connecitons quickly scales to 1 million received messages per second. +different connections quickly scales to 1 million received messages per second. I had concerns about many concurrent streams in singe connection, but 2^20 subscribers still work (though eat memory, with 10 publishers 20GB are consumed; in this implementation each publisher holds full copy of all subscribers). There @@ -95,12 +95,12 @@ other members, with best-effort this is simple. ### Security implications Communication happens in a private network that is not exposed to users; -additionaly we can add auth to the broker. +additionally we can add auth to the broker. ## Alternative: get existing pub-sub We could take some existing pub sub solution, e.g. RabbitMQ, Redis. But in this -case IMV simplicity of our own outweights external dependency costs (RabbitMQ is +case IMV simplicity of our own outweighs external dependency costs (RabbitMQ is much more complicated and needs VM; Redis Rust client maintenance is not ideal...). Also note that projects like CockroachDB and TiDB are based on gRPC as well. 
diff --git a/docs/rfcs/019-tenant-timeline-lifecycles.md b/docs/rfcs/019-tenant-timeline-lifecycles.md index 2734bf17b9..558b5335e7 100644 --- a/docs/rfcs/019-tenant-timeline-lifecycles.md +++ b/docs/rfcs/019-tenant-timeline-lifecycles.md @@ -74,7 +74,7 @@ TenantMaintenanceGuard: Like ActiveTenantGuard, but can be held even when the tenant is not in Active state. Used for operations like attach/detach. Perhaps allow only one such guard on a Tenant at a time. -Similarly for Timelines. We don't currentl have a "state" on Timeline, but I think +Similarly for Timelines. We don't currently have a "state" on Timeline, but I think we need at least two states: Active and Stopping. The Stopping state is used at deletion, to prevent new TimelineActiveGuards from appearing, while you wait for existing TimelineActiveGuards to die out. @@ -85,7 +85,7 @@ have a TenantActiveGuard, and the tenant's state changes from Active to Stopping, the is_shutdown_requested() function should return true, and shutdown_watcher() future should return. -This signaling doesn't neessarily need to cover all cases. For example, if you +This signaling doesn't necessarily need to cover all cases. For example, if you have a block of code in spawn_blocking(), it might be acceptable if is_shutdown_requested() doesn't return true even though the tenant is in Stopping state, as long as the code finishes reasonably fast. diff --git a/docs/rfcs/020-pageserver-s3-coordination.md b/docs/rfcs/020-pageserver-s3-coordination.md index 5e2912ba99..90ba3a6f4d 100644 --- a/docs/rfcs/020-pageserver-s3-coordination.md +++ b/docs/rfcs/020-pageserver-s3-coordination.md @@ -37,7 +37,7 @@ sequenceDiagram ``` At this point it is not possible to restore from index, it contains L2 which -is no longer available in s3 and doesnt contain L3 added by compaction by the +is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. 
So if any of the pageservers restart initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) @@ -74,7 +74,7 @@ One possible solution for relocation case is to orchestrate background jobs from outside. The oracle who runs migration can turn off background jobs on PS1 before migration and then run migration -> enable them on PS2. The problem comes if migration fails. In this case in order to resume background jobs -oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt +oracle needs to guarantee that PS2 doesn't run background jobs and if it doesn't respond then PS1 is stuck unable to run compaction/gc. This cannot be solved without human ensuring that no upload from PS2 can happen. In order to be able to resolve this automatically CAS is required on S3 side so pageserver can @@ -128,7 +128,7 @@ During discussion it seems that we converged on the approach consisting of: whether we need to apply change to the index state or not. - Responsibility for running background jobs is assigned externally. Pageserver keeps locally persistent flag for each tenant that indicates whether this - pageserver is considered as primary one or not. TODO what happends if we + pageserver is considered as primary one or not. TODO what happens if we crash and cannot start for some extended period of time? Control plane can assign ownership to some other pageserver. Pageserver needs some way to check if its still the blessed one. 
Maybe by explicit request to control plane on @@ -138,7 +138,7 @@ Requirement for deterministic layer generation was considered overly strict because of two reasons: - It can limit possible optimizations e g when pageserver wants to reshuffle - some data locally and doesnt want to coordinate this + some data locally and doesn't want to coordinate this - The deterministic algorithm itself can change so during deployments for some time there will be two different version running at the same time which can cause non determinism @@ -164,7 +164,7 @@ sequenceDiagram CP->>PS1: Yes deactivate CP PS1->>S3: Fetch PS1 index. - note over PS1: Continue operations, start backround jobs + note over PS1: Continue operations, start background jobs note over PS1,PS2: PS1 starts up and still and is not a leader anymore PS1->>CP: Am I still the leader for Tenant X? CP->>PS1: No @@ -203,7 +203,7 @@ sequenceDiagram ### Eviction When two pageservers operate on a tenant for extended period of time follower -doesnt perform write operations in s3. When layer is evicted follower relies +doesn't perform write operations in s3. When layer is evicted follower relies on updates from primary to get info about layers it needs to cover range for evicted layer. diff --git a/docs/rfcs/022-pageserver-delete-from-s3.md b/docs/rfcs/022-pageserver-delete-from-s3.md index 260e549670..c237a3edb8 100644 --- a/docs/rfcs/022-pageserver-delete-from-s3.md +++ b/docs/rfcs/022-pageserver-delete-from-s3.md @@ -4,7 +4,7 @@ Created on 08.03.23 ## Motivation -Currently we dont delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC). +Currently we don't delete pageserver part of the data from s3 when project is deleted. (The same is true for safekeepers, but this outside of the scope of this RFC). 
This RFC aims to spin a discussion to come to a robust deletion solution that wont put us in into a corner for features like postponed deletion (when we keep data for user to be able to restore a project if it was deleted by accident) @@ -75,9 +75,9 @@ Remote one is needed for cases when pageserver is lost during deletion so other Why local mark file is needed? -If we dont have one, we have two choices, delete local data before deleting the remote part or do that after. +If we don't have one, we have two choices, delete local data before deleting the remote part or do that after. -If we delete local data before remote then during restart pageserver wont pick up remote tenant at all because nothing is available locally (pageserver looks for remote conuterparts of locally available tenants). +If we delete local data before remote then during restart pageserver won't pick up remote tenant at all because nothing is available locally (pageserver looks for remote counterparts of locally available tenants). If we delete local data after remote then at the end of the sequence when remote mark file is deleted if pageserver restart happens then the state is the same to situation when pageserver just missing data on remote without knowing the fact that this data is intended to be deleted. In this case the current behavior is upload everything local-only to remote. @@ -145,7 +145,7 @@ sequenceDiagram CP->>PS: Retry delete tenant PS->>CP: Not modified else Mark is missing - note over PS: Continue to operate the tenant as if deletion didnt happen + note over PS: Continue to operate the tenant as if deletion didn't happen note over CP: Eventually console should
retry delete request @@ -168,7 +168,7 @@ sequenceDiagram PS->>CP: True ``` -Similar sequence applies when both local and remote marks were persisted but Control Plane still didnt receive a response. +Similar sequence applies when both local and remote marks were persisted but Control Plane still didn't receive a response. If pageserver crashes after both mark files were deleted then it will reply to control plane status poll request with 404 which should be treated by control plane as success. @@ -187,7 +187,7 @@ If pageseserver is lost then the deleted tenant should be attached to different ##### Restrictions for tenant that is in progress of being deleted -I propose to add another state to tenant/timeline - PendingDelete. This state shouldnt allow executing any operations aside from polling the deletion status. +I propose to add another state to tenant/timeline - PendingDelete. This state shouldn't allow executing any operations aside from polling the deletion status. #### Summary @@ -237,7 +237,7 @@ New branch gets created PS1 starts up (is it possible or we just recycle it?) PS1 is unaware of the new branch. It can either fall back to s3 ls, or ask control plane. -So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage neeeds to ask control plane. +So here comes the dependency of storage on control plane. During restart storage needs to know which timelines are valid for operation. If there is nothing on s3 that can answer that question storage needs to ask control plane. ### Summary @@ -250,7 +250,7 @@ Cons: Pros: -- Easier to reason about if you dont have to account for pageserver restarts +- Easier to reason about if you don't have to account for pageserver restarts ### Extra notes @@ -262,7 +262,7 @@ Delayed deletion can be done with both approaches. 
As discussed with Anna (@step After discussion in comments I see that we settled on two options (though a bit different from ones described in rfc). First one is the same - pageserver owns as much as possible. The second option is that pageserver owns markers thing, but actual deletion happens in control plane by repeatedly calling ls + delete. -To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge abouth paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesnt support shutting down pageservers, which are separate docker containers there instead of just processes. +To my mind the only benefit of the latter approach is possible code reuse between safekeepers and pageservers. Otherwise poking around integrating s3 library into control plane, configuring shared knowledge about paths in s3 - are the downsides. Another downside of relying on control plane is the testing process. Control plane resides in different repository so it is quite hard to test pageserver related changes there. e2e test suite there doesn't support shutting down pageservers, which are separate docker containers there instead of just processes. With pageserver owning everything we still give the retry logic to control plane but its easier to duplicate if needed compared to sharing inner s3 workings. We will have needed tests for retry logic in neon repo. 
diff --git a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md index 836c91fb25..97e62bf8c6 100644 --- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md +++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md @@ -75,7 +75,7 @@ sequenceDiagram ``` At this point it is not possible to restore the state from index, it contains L2 which -is no longer available in s3 and doesnt contain L3 added by compaction by the +is no longer available in s3 and doesn't contain L3 added by compaction by the first pageserver. So if any of the pageservers restart, initial sync will fail (or in on-demand world it will fail a bit later during page request from missing layer) @@ -171,7 +171,7 @@ sequenceDiagram Another problem is a possibility of concurrent branch creation calls. -I e during migration create_branch can be called on old pageserver and newly created branch wont be seen on new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we dont need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. +I.e. during migration create_branch can be called on old pageserver and newly created branch won't be seen on new pageserver.
Prior art includes prototyping an approach of trying to mirror such branches, but currently it lost its importance, because now attach is fast because we don't need to download all data, and additionally to the best of my knowledge of control plane internals (cc @ololobus to confirm) operations on one project are executed sequentially, so it is not possible to have such case. So branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state. ## Simplistic approach diff --git a/docs/rfcs/024-extension-loading.md b/docs/rfcs/024-extension-loading.md index 26ba4f7927..7e243b23e3 100644 --- a/docs/rfcs/024-extension-loading.md +++ b/docs/rfcs/024-extension-loading.md @@ -55,7 +55,7 @@ When PostgreSQL requests a file, `compute_ctl` downloads it. PostgreSQL requests files in the following cases: - When loading a preload library set in `local_preload_libraries` - When explicitly loading a library with `LOAD` -- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files))) +- When creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files))) #### Summary diff --git a/docs/rfcs/025-generation-numbers.md b/docs/rfcs/025-generation-numbers.md index 6a0131c66a..dfc8529d2d 100644 --- a/docs/rfcs/025-generation-numbers.md +++ b/docs/rfcs/025-generation-numbers.md @@ -26,7 +26,7 @@ plane guarantee prevents robust response to failures, as if a pageserver is unre we may not detach from it. The mechanism in this RFC fixes this, by making it safe to attach to a new, different pageserver even if an unresponsive pageserver may be running. 
-Futher, lack of safety during split-brain conditions blocks two important features where occasional +Further, lack of safety during split-brain conditions blocks two important features where occasional split-brain conditions are part of the design assumptions: - seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029)) @@ -490,11 +490,11 @@ The above makes it safe for control plane to change the assignment of tenant to pageserver in control plane while a timeline creation is ongoing. The reason is that the creation request against the new assigned pageserver uses a new generation number. However, care must be taken by control plane -to ensure that a "timeline creation successul" response from some pageserver +to ensure that a "timeline creation successful" response from some pageserver is checked for the pageserver's generation for that timeline's tenant still being the latest. If it is not the latest, the response does not constitute a successful timeline creation. It is acceptable to discard such responses, the scrubber will clean up the S3 state. -It is better to issue a timelien deletion request to the stale attachment. +It is better to issue a timeline deletion request to the stale attachment. #### Timeline Deletion @@ -633,7 +633,7 @@ As outlined in the Part 1 on correctness, it is critical that deletions are only executed once the key is not referenced anywhere in S3. This property is obviously upheld by the scheme above. -#### We Accept Object Leakage In Acceptable Circumcstances +#### We Accept Object Leakage In Acceptable Circumstances If we crash in the flow above between (2) and (3), we lose track of unreferenced object. Further, enqueuing a single to the persistent queue may not be durable immediately to amortize cost of flush to disk.
diff --git a/docs/rfcs/026-pageserver-s3-mvcc.md b/docs/rfcs/026-pageserver-s3-mvcc.md index 2a8c925781..473d5a2bd0 100644 --- a/docs/rfcs/026-pageserver-s3-mvcc.md +++ b/docs/rfcs/026-pageserver-s3-mvcc.md @@ -162,7 +162,7 @@ struct Tenant { ... txns: HashMap, - // the most recently started txn's id; only most recently sarted can win + // the most recently started txn's id; only most recently started can win next_winner_txn: Option, } struct Transaction { @@ -186,7 +186,7 @@ A transaction T in state Committed has subsequent transactions that may or may n So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged: -- Commited: delete objects on the deadlist. +- Committed: delete objects on the deadlist. - We don’t need a LIST request here, the deadlist is sufficient. So, it’s really cheap. - This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T ’s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don’t matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they’re destined to get anyways. 404’s for RejectAcknowledged is handled below. - RejectAcknowledged: delete all objects created in that txn, and discard deadlists. @@ -242,15 +242,15 @@ If a pageserver is unresponsive from Control Plane’s / Compute’s perspective At this point, availability is restored and user pain relieved. -What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it beomes RejectAcknowledged, and GC can make progress. 
Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure: +What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure: 1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above. 2. Make a human operator investigate decide what to do (next morning, NO ONCALL ALERT): 1. Inspect the instance, investigate logs, understand root cause. 2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC. - 3. Use below procedure to decomission pageserver. + 3. Use below procedure to decommission pageserver. -### Decomissioning A Pageserver (Dead or Alive-but-Unrespsonive) +### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive) The solution, enabled by this proposal: @@ -310,7 +310,7 @@ Issues that we discussed: 1. In abstract terms, this proposal provides a linearized history for a given S3 prefix. 2. In concrete terms, this proposal provides a linearized history per tenant. 3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history. -4. ************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written prospoal like this one:************************************************************************************ +4. 
************************************************************************************Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:************************************************************************************ 1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT 2. @Dmitry Rodionov : 3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment. diff --git a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md index 2c6b46eabe..e18b7c16c9 100644 --- a/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md +++ b/docs/rfcs/027-crash-consistent-layer-map-through-index-part.md @@ -54,7 +54,7 @@ If the compaction algorithm doesn't change between the two compaction runs, is d *However*: 1. the file size of the overwritten L1s may not be identical, and 2. the bit pattern of the overwritten L1s may not be identical, and, -3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite +3. in the future, we may want to make the compaction code non-deterministic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted). 
@@ -63,7 +63,7 @@ But node B based its world view on the version of node A's `index_part.json` fro That earlier `index_part.json`` contained the file size of the pre-overwrite L1. If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1. Effectively, the data in the L1 has become inaccessible to node B. -If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem. +If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same problem. If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems. @@ -121,7 +121,7 @@ Multi-object changes that previously created and removed files in timeline dir a * atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic * local timeline dir state: * irrelevant for layer map content => irrelevant for atomic updates / crash consistency - * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them + * if we crash after index part PUT, local layer files will be used, so, no on-demand downloads needed for them * if we crash before index part PUT, local layer files will be deleted ## Trade-Offs @@ -140,7 +140,7 @@ Assuming upload queue allows for unlimited queue depth (that's what it does toda * wal ingest: currently unbounded * L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()` * Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M. - * In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`. + * In practice, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`. 
* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))` * I have no intuition how expensive / long-running it is in reality. * gc: `update_gc_info`` work (not substantial, AFAIK) @@ -158,7 +158,7 @@ Pageserver crashes are very rare ; it would likely be acceptable to re-do the lo However, regular pageserver restart happen frequently, e.g., during weekly deploys. In general, pageserver restart faces the problem of tenants that "take too long" to shut down. -They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down. +They are a problem because other tenants that shut down quickly are unavailable while we wait for the slow tenants to shut down. We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file). A longer budget would expose tenants that are done early to a longer downtime. A short budget would risk throwing away more work that'd have to be re-done after restart. @@ -236,7 +236,7 @@ tenants/$tenant/timelines/$timeline/$key_and_lsn_range tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range ``` -To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`. +To guarantee uniqueness, the unique number is a sequence number, stored in `index_part.json`. This alternative does not solve atomic layer map updates. In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers. @@ -246,11 +246,11 @@ We'd need to write a deduplication pass that checks if perfectly overlapping lay However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC. So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3). 
-But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute. +But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more acute. The proposed design in this RFC addresses both. So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top. -That way, we avoid a phase where the crash-during-compaction problem is accute. +That way, we avoid a phase where the crash-during-compaction problem is acute. ## Related issues diff --git a/docs/rfcs/028-pageserver-migration.md b/docs/rfcs/028-pageserver-migration.md index f708f641aa..17ef9aef52 100644 --- a/docs/rfcs/028-pageserver-migration.md +++ b/docs/rfcs/028-pageserver-migration.md @@ -596,4 +596,4 @@ pageservers are updated to be aware of it. As well as simplifying implementation, putting heatmaps in S3 will be useful for future analytics purposes -- gathering aggregated statistics on activity -pattersn across many tenants may be done directly from data in S3. +patterns across many tenants may be done directly from data in S3. diff --git a/docs/rfcs/029-pageserver-wal-disaster-recovery.md b/docs/rfcs/029-pageserver-wal-disaster-recovery.md index 15ebd72bfe..229e40100e 100644 --- a/docs/rfcs/029-pageserver-wal-disaster-recovery.md +++ b/docs/rfcs/029-pageserver-wal-disaster-recovery.md @@ -147,7 +147,7 @@ Separating corrupt writes from non-corrupt ones is a hard problem in general, and if the application was involved in making the corrupt write, a recovery would also involve the application. Therefore, corruption that has made it into the WAL is outside of the scope of this feature. However, the WAL replay can be -issued to right before the point in time where the corruption occured. Then the +issued to right before the point in time where the corruption occurred. 
Then the data loss is isolated to post-corruption writes only. ## Impacted components (e.g. pageserver, safekeeper, console, etc) @@ -161,7 +161,7 @@ limits and billing we apply to existing timelines. ## Proposed implementation -The first problem to keep in mind is the reproducability of `initdb`. +The first problem to keep in mind is the reproducibility of `initdb`. So an initial step would be to upload `initdb` snapshots to S3. After that, we'd have the endpoint spawn a background process which diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md index d4017471b7..093a964f38 100644 --- a/docs/rfcs/030-vectored-timeline-get.md +++ b/docs/rfcs/030-vectored-timeline-get.md @@ -69,7 +69,7 @@ However, unlike above, an ideal solution will * This means, read each `DiskBtree` page at most once. * Facilitate merging of the reads we issue to the OS and eventually NVMe. -Each of these items above represents a signficant amount of work. +Each of these items above represents a significant amount of work. ## Performance diff --git a/docs/rfcs/031-sharding-static.md b/docs/rfcs/031-sharding-static.md new file mode 100644 index 0000000000..fe009b8660 --- /dev/null +++ b/docs/rfcs/031-sharding-static.md @@ -0,0 +1,408 @@ +# Sharding Phase 1: Static Key-space Sharding + +## Summary + +To enable databases with sizes approaching the capacity of a pageserver's disk, +it is necessary to break up the storage for the database, or _shard_ it. + +Sharding in general is a complex area. This RFC aims to define an initial +capability that will permit creating large-capacity databases using a static configuration +defined at time of Tenant creation. + +## Motivation + +Currently, all data for a Tenant, including all its timelines, is stored on a single +pageserver. The local storage required may be several times larger than the actual +database size, due to LSM write inflation. 
+ +If a database is larger than what one pageserver can hold, then it becomes impossible +for the pageserver to hold it in local storage, as it must do to provide service to +clients. + +### Prior art + +In Neon: + +- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4 +- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843 +- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677 + +Prior art in other distributed systems is too broad to capture here: pretty much +any scale out storage system does something like this. + +## Requirements + +- Enable creating a large (for example, 16TiB) database without requiring dedicated + pageserver nodes. +- Share read/write bandwidth costs for large databases across pageservers, as well + as storage capacity, in order to avoid large capacity databases acting as I/O hotspots + that disrupt service to other tenants. +- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres + does not write out a single contiguous range of page numbers. + +_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database +that a user might create on a current-gen enterprise SSD should also work well on +Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the +pageserver backend is not the limiting factor in the database size_. + +## Non Goals + +- Independently distributing timelines within the same tenant. If a tenant has many + timelines, then sharding may be a less efficient mechanism for distributing load than + sharing out timelines between pageservers. 
+- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only, + based on the idea that separate mechanisms will make sense for each dimension. + +## Impacted Components + +pageserver, control plane, postgres/smgr + +## Terminology + +**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store, +the page number is the key in that store. `Key` is a literal data type in existing code. + +**LSN dimension**: this just means the range of LSNs (history), when talking about the range +of keys and LSNs as a two dimensional space. + +## Implementation + +### Key sharding vs. LSN sharding + +When we think of sharding across the two dimensional key/lsn space, this is an +opportunity to think about how the two dimensions differ: + +- Sharding the key space distributes the _write_ workload of ingesting data + and compacting. This work must be carefully managed so that exactly one + node owns a given key. +- Sharding the LSN space distributes the _historical read_ workload. This work + can be done by anyone without any special coordination, as long as they can + see the remote index and layers. + +The key sharding is the harder part, and also the more urgent one, to support larger +capacity databases. Because distributing historical LSN read work is a relatively +simpler problem that most users don't have, we defer it to future work. It is anticipated +that some quite simple P2P offload model will enable distributing work for historical +reads: a node which is low on space can call out to peer to ask it to download and +serve reads from a historical layer. + +### Key mapping scheme + +Having decided to focus on key sharding, we must next decide how we will map +keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise +between data locality and avoiding entire large relations mapping to the same shard. 
+ +We will define two spaces: + +- Key space: unsigned integer +- Shard space: integer from 0 to N-1, where we have N shards. + +### Key -> Shard mapping + +Keys are currently defined in the pageserver's getpage@lsn interface as follows: + +``` +pub struct Key { + pub field1: u8, + pub field2: u32, + pub field3: u32, + pub field4: u32, + pub field5: u8, + pub field6: u32, +} + + +fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: blknum, + } +} +``` + +_Note: keys for relation metadata are ignored here, as this data will be mirrored to all +shards. For distribution purposes, we only care about user data keys_ + +The properties we want from our Key->Shard mapping are: + +- Locality in `blknum`, such that adjacent `blknum` will usually map to + the same stripe and consequently land on the same shard, even though the overall + collection of blocks in a relation will be spread over many stripes and therefore + many shards. +- Avoid the same blknum on different relations landing on the same stripe, so that + with many small relations we do not end up aliasing data to the same stripe/shard. +- Avoid vulnerability to aliasing in the values of relation identity fields, such that + if there are patterns in the value of `relnode`, these do not manifest as patterns + in data placement. + +To accomplish this, the blknum is used to select a stripe, and stripes are +assigned to shards in a pseudorandom order via a hash. The motivation for +pseudo-random distribution (rather than sequential mapping of stripe to shard) +is to avoid I/O hotspots when sequentially reading multiple relations: we don't want +all relations' stripes to touch pageservers in the same order. + +To map a `Key` to a shard: + +- Hash the `Key` field 4 (relNode). 
+- Divide field 6 (`blknum`) by the stripe size in pages, and combine the + hash of this with the hash from the previous step. +- The total hash modulo the shard count gives the shard holding this key. + +Why don't we use the other fields in the Key? + +- We ignore `forknum` for key mapping, because it distinguishes different classes of data + in the same relation, and we would like to keep the data in a relation together. +- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created + database's blocks differ only by spcNode and dbNode from the original. To enable running + this type of creation without cross-pageserver communication, we must ensure that these + blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash. + +### Data placement examples + +For example, consider the extreme large databases cases of postgres data layout in a system with 8 shards +and a stripe size of 32k pages: + +- A single large relation: `blknum` division will break the data up into 4096 + stripes, which will be scattered across the shards. +- 4096 relations of 32k pages each: each relation will map to exactly one stripe, + and that stripe will be placed according to the hash of key field 4. The + data placement will be statistically uniform across shards. + +Data placement will be more uneven on smaller databases: + +- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance + that both relations land on the same shard and no data lands on the other shard. +- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double + the data of the other four shards. 
+ +These uneven cases for small amounts of data do not matter, as long as the stripe size +is an order of magnitude smaller than the amount of data we are comfortable holding +in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if +a tenant has some shards with 256MB size and some shards with 512MB size, even though +the standard deviation of shard size within the tenant is very high. Our key mapping +scheme provides a statistical guarantee that as the tenant's overall data size increases, +uniformity of placement will improve. + +### Important Types + +#### `ShardIdentity` + +Provides the information needed to know whether a particular key belongs +to a particular shard: + +- Layout version +- Stripe size +- Shard count +- Shard index + +This structure's size is constant. Note that if we had used a different key +mapping scheme such as consistent hashing with explicit hash ranges assigned +to each shard, then the ShardIdentity's size would grow with the shard count: the simpler +key mapping scheme used here enables a small fixed size ShardIdentity. + +### Pageserver changes + +#### Structural + +Everywhere the Pageserver currently deals with Tenants, it will move to dealing with +`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part +of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity` +covers the whole keyspace. + +When the pageserver writes layers and index_part.json to remote storage, it must +include the shard index & count in the name, to avoid collisions (the count is +necessary for future-proofing: the count will vary in time). These keys +will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work +exactly the same for TenantShards as it does for Tenants today: each shard will have +its own generation number. 
+ +#### Storage Format: Keys + +For tenants with >1 shard, layer files implicitly become sparse: within the key +range described in the layer name, the layer file for a shard will only hold the +content relevant to stripes assigned to the shard. + +For this reason, the LayerFileName within a tenant is no longer unique: different shards +may use the same LayerFileName to refer to different data. We may solve this simply +by including the shard number in the keys used for layers. + +The shard number will be included as a prefix (as part of tenant ID), like this: + +`pageserver/v1/tenants/-/timelines//-` + +`pageserver/v1/tenants/-/timelines//index_part.json-` + +Reasons for this particular format: + +- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere + we construct a layer file name), and enables efficient listing of index_parts within + a particular shard-timeline prefix. +- Including the shard _count_ as well as shard number means that in future when we implement + shard splitting, it will be possible for a parent shard and one of its children to write + the same layer file without a name collision. For example, a parent shard 0_1 might split + into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part + that is distinct from what shard 0_1 would have written at the same place. + +In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient, +and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`, +for example a single-shard tenant's prefix will be `0001`. + +For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0, +and use this as a cue to construct paths with no prefix at all. + +#### Storage Format: Indices + +In the phase 1 described in this RFC, shards only reference layers they write themselves. 
However, +when we implement shard splitting in future, it will be useful to enable shards to reference layers +written by other shards (specifically the parent shard during a split), so that shards don't +have to exhaustively copy all data into their own shard-prefixed keys. + +To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count) +tuple on each layer, such that it can construct paths for layers written by other shards. This +naturally raises the question of who "owns" such layers written by ancestral shards: this problem +will be addressed in phase 2. + +For backward compatibility, any index entry without shard information will be assumed to be +in the legacy shardidentity. + +#### WAL Ingest + +In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter +it down to the pages relevant to their shard: + +- For ordinary user data writes, only retain a write if it matches the ShardIdentity +- For metadata describing relations etc, all shards retain these writes. + +The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn: +one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards, +and have only the 0th shard populate remote_consistent_lsn. However, this is relatively +expensive: if the safekeeper can be made shard-aware then it could be taught to use +the max() of all shards' remote_consistent_lsns to decide when to trim the WAL. + +#### Compaction/GC + +No changes needed. + +The pageserver doesn't have to do anything special during compaction +or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity. +This will result in sparse layer files, containing keys only in the stripes that this +shard owns. 
Where optimizations currently exist in compaction for spotting "gaps" in +the key range, these should be updated to ignore gaps that are due to sharding, to +avoid spuriously splitting up layers into stripe-sized pieces. + +### Compute Endpoints + +Compute endpoints will need to: + +- Accept a vector of connection strings as part of their configuration from the control plane +- Route pageserver requests according to mapping the hash of key to the correct + entry in the vector of connection strings. + +Doing this in compute rather than routing requests via a single pageserver is +necessary to enable sharding tenants without adding latency from extra hops. + +### Control Plane + +Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will +be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing +tenants. + +Tenant lifecycle operations like deletion will require fanning-out to all the shards +in the tenant. The same goes for timeline creation and deletion: a timeline should +not be considered created until it has been created in all shards. + +#### Selectively enabling sharding for large tenants + +Initially, we will explicitly enable sharding for large tenants only. + +In future, this hint mechanism will become optional when we implement automatic +re-sharding of tenants. + +## Future Phases + +This section exists to indicate what will likely come next after this phase. + +Phases 2a and 2b are amenable to execution in parallel. + +### Phase 2a: WAL fan-out + +**Problem**: when all shards consume the whole WAL, the network bandwidth used +for transmitting the WAL from safekeeper to pageservers is multiplied by a factor +of the shard count. 
+ +Network bandwidth is not our most pressing bottleneck, but it is likely to become +a problem if we set a modest shard count (~8) on a significant number of tenants, +especially as those larger tenants which we shard are also likely to have higher +write bandwidth than average. + +### Phase 2b: Shard Splitting + +**Problem**: the number of shards in a tenant is defined at creation time and cannot +be changed. This causes excessive sharding for most small tenants, and an upper +bound on scale for very large tenants. + +To address this, a _splitting_ feature will later be added. One shard can split its +data into a number of children by doing a special compaction operation to generate +image layers broken up child-shard-wise, and then writing out an `index_part.json` for +each child. This will then require external coordination (by the control plane) to +safely attach these new child shards and then move them around to distribute work. +The opposite _merging_ operation can also be imagined, but is unlikely to be implemented: +once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify +the risk/complexity of implementing such a rarely-encountered scenario. + +### Phase N (future): distributed historical reads + +**Problem**: while sharding based on key is good for handling changes in overall +database size, it is less suitable for spiky/unpredictable changes in the read +workload to historical layers. Sudden increases in historical reads could result +in sudden increases in local disk capacity required for a TenantShard. + +Example: the extreme case of this would be to run a tenant for a year, then create branches +with ancestors at monthly intervals. This could lead to a sudden 12x inflation in +the on-disk capacity footprint of a TenantShard, since it would be serving reads +from all those disparate historical layers. 
+ +If we can respond fast enough, then key-sharding a tenant more finely can help with +this, but splitting may be a relatively expensive operation and the increased historical +read load may be transient. + +A separate mechanism for handling heavy historical reads could be something like +a gossip mechanism for pageservers to communicate +about their workload, and then a getpageatlsn offload mechanism where one pageserver can +ask another to go read the necessary layers from remote storage to serve the read. This +requires relatively little coordination because it is read-only: any node can service any +read. All reads to a particular shard would still flow through one node, but the +disk capacity & I/O impact of servicing the read would be distributed. + +## FAQ/Alternatives + +### Why stripe the data, rather than using contiguous ranges of keyspace for each shard? + +When a database is growing under a write workload, writes may predominantly hit the +end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user +is intensively re-writing a particular relation, if that relation lived in a particular +shard then it would not achieve our goal of distributing the write work across shards. + +### Why not proxy read requests through one pageserver, so that endpoints don't have to change? + +1. This would not achieve scale-out of network bandwidth: a busy tenant with a large + database would still cause a load hotspot on the pageserver routing its read requests. +2. 
The additional hop through the "proxy" pageserver would add latency and overall + resource cost (CPU, network bandwidth) + +### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers + +In this model, there would be no explicit sharding of work, but the pageserver to which +a tenant is attached would not hold all layers on its disk: instead, it would call out +to peers to have them store some layers, and call out to those peers to request reads +in those layers. + +This mechanism will work well for distributing work in the LSN dimension, but in the key +space dimension it has the major limitation of requiring one node to handle all +incoming writes, and compactions. Even if the write workload for a large database +fits in one pageserver, it will still be a hotspot and such tenants may still +de-facto require their own pageserver. diff --git a/docs/rfcs/032-shard-splitting.md b/docs/rfcs/032-shard-splitting.md new file mode 100644 index 0000000000..d5fbda8415 --- /dev/null +++ b/docs/rfcs/032-shard-splitting.md @@ -0,0 +1,479 @@ +# Shard splitting + +## Summary + +This RFC describes a new pageserver API for splitting an existing tenant shard into +multiple shards, and describes how to use this API to safely increase the total +shard count of a tenant. + +## Motivation + +In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale +tenants beyond the capacity of a single pageserver by breaking up the key space +into stripes, and distributing these stripes across many pageservers. However, +the shard count was defined once at tenant creation time and not varied thereafter. 
+ +In practice, the expected size of a database is rarely known at creation time, and +it is inefficient to enable sharding for very small tenants: we need to be +able to create a tenant with a small number of shards (such as 1), and later expand +when it becomes clear that the tenant has grown in size to a point where sharding +is beneficial. + +### Prior art + +Many distributed systems have the problem of choosing how many shards to create for +tenants that do not specify an expected size up-front. There are a couple of general +approaches: + +- Write to a key space in order, and start a new shard when the highest key advances + past some point. This doesn't work well for Neon, because we write to our key space + in many different contiguous ranges (per relation), rather than in one contiguous + range. To adapt to this kind of model, we would need a sharding scheme where each + relation had its own range of shards, which would be inefficient for the common + case of databases with many small relations. +- Monitor the system, and automatically re-shard at some size threshold. For + example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py) + component monitors the size of each RADOS Pool, and adjusts the number of Placement + Groups (Ceph's shard equivalent). + +## Requirements + +- A configurable capacity limit per-shard is enforced. +- Changes in shard count do not interrupt service beyond requiring postgres + to reconnect (i.e. milliseconds). +- Human being does not have to choose shard count + +## Non Goals + +- Shard splitting is always a tenant-global operation: we will not enable splitting + one shard while leaving others intact. +- The inverse operation (shard merging) is not described in this RFC. 
This is a lower + priority than splitting, because databases grow more often than they shrink, and + a database with many shards will still work properly if the stored data shrinks, just + with slightly more overhead (e.g. redundant WAL replication) +- Shard splitting is only initiated based on capacity bounds, not load. Splitting + a tenant based on load will make sense for some medium-capacity, high-load workloads, + but is more complex to reason about and likely is not desirable until we have + shard merging to reduce the shard count again if the database becomes less busy. + +## Impacted Components + +pageserver, storage controller + +(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment) + +## Terminology + +**Parent** shards are the shards that exist before a split. **Child** shards are +the new shards created during a split. + +**Shard** is synonymous with _tenant shard_. + +**Shard Index** is the 2-tuple of shard number and shard count, written in +paths as {:02x}{:02x}, e.g. `0001`. + +## Background + +In the implementation section, a couple of existing aspects of sharding are important +to remember: + +- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is + a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local + storage paths, and remote index metadata. +- Remote layer file paths contain the shard index of the shard that created them, and + remote indices contain the same index to enable building the layer file path. A shard's + index may reference layers that were created by another shard. +- Local tenant shard directories include the shard index. All layers downloaded by + a tenant shard are stored in this shard-prefixed path, even if those layers were + initially created by another shard: tenant shards do not read and write one anothers' + paths. +- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant. 
+ This is for historical reasons and will be cleaned up in future, but the existing + name is used here to help comprehension when reading code. + +## Implementation + +Note: this section focuses on the correctness of the core split process. This will +be fairly inefficient in a naive implementation, and several important optimizations +are described in a later section. + +There are broadly two parts to the implementation: + +1. The pageserver split API, which splits one shard on one pageserver +2. The overall tenant split process which is coordinated by the storage controller, + and calls into the pageserver split API as needed. + +### Pageserver Split API + +The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split` +that takes the new total shard count in the body. + +The pageserver split API operates on one tenant shard, on one pageserver. External +coordination is required to use it safely, this is described in the later +'Split procedure' section. + +#### Preparation + +First identify the shard indices for the new child shards. These are deterministic, +calculated from the parent shard's index, and the number of children being created (this +is an input to the API, and validated to be a power of two). In a trivial example, splitting +0001 in two always results in 0002 and 0102. + +Child shard indices are chosen such that the children's parts of the keyspace will +be subsets of the parent's parts of the keyspace. + +#### Step 1: write new remote indices + +In remote storage, splitting is very simple: we may just write new index_part.json +objects for each child shard, containing exactly the same layers as the parent shard. + +The children will have more data than they need, but this avoids any exhaustive +re-writing or copying of layer files. + +The index key path includes a generation number: the parent shard's current +attached generation number will also be used for the child shards' indices. 
This +makes the operation safely retryable: if everything crashes and restarts, we may +call the split API again on the parent shard, and the result will be some new remote +indices for the child shards, under a higher generation number. + +#### Step 2: start new `Tenant` objects + +A new `Tenant` object may be instantiated for each child shard, while the parent +shard still exists. When calling the tenant_spawn function for this object, +the remote index from step 1 will be read, and the child shard will start +to ingest WAL to catch up from whatever was in the remote storage at step 1. + +We now wait for child shards' WAL ingestion to catch up with the parent shard, +so that we can safely tear down the parent shard without risking an availability +gap to clients reading recent LSNs. + +#### Step 3: tear down parent `Tenant` object + +Once child shards are running and have caught up with WAL ingest, we no longer +need the parent shard. Note that clients may still be using it -- when we +shut it down, any page_service handlers will also shut down, causing clients +to disconnect. When the client reconnects, it will re-lookup the tenant, +and hit the child shard instead of the parent (shard lookup from page_service +should bias toward higher ShardCount shards). + +Note that at this stage the page service client has not yet been notified of +any split. In the trivial single split example: + +- Shard 0001 is gone: Tenant object torn down +- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live. +- Clients will continue to connect to that server thinking that shard 0001 is there, + and all requests will work, because any key that was in shard 0001 is definitely + available in either shard 0002 or shard 0102. 
+- Eventually, the storage controller (not the pageserver) will decide to migrate + some child shards away: at that point it will do a live migration, ensuring + that the client has an updated configuration before it detaches anything + from the original server. + +#### Complete + +When we send a 200 response to the split request, we are promising the caller: + +- That the child shards are persistent in remote storage +- That the parent shard has been shut down + +This enables the caller to proceed with the overall shard split operation, which +may involve other shards on other pageservers. + +### Storage Controller Split procedure + +Splitting a tenant requires calling the pageserver split API, and tracking +enough state to ensure recovery + completion in the event of any component (pageserver +or storage controller) crashing (or request timing out) during the split. + +1. call the split API on all existing shards. Ensure that the resulting + child shards are pinned to their pageservers until _all_ the split calls are done. + This pinning may be implemented as a "split bit" on the tenant shards, that + blocks any migrations, and also acts as a sign that if we restart, we must go + through some recovery steps to resume the split. +2. Once all the split calls are done, we may unpin the child shards (clear + the split bit). The split is now complete: subsequent steps are just migrations, + not strictly part of the split. +3. Try to schedule new pageserver locations for the child shards, using + a soft anti-affinity constraint to place shards from the same tenant onto different + pageservers. + +Updating computes about the new shard count is not necessary until we migrate +any of the child shards away from the parent's location. + +### Recovering from failures + +#### Rolling back an incomplete split + +An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers, +and detaching child shards. 
This will lose any WAL ingested into the children after the parents +were detached earlier, but the parents will catch up. + +No special pageserver API is needed for this. From the storage controller's point of view, the +procedure is: + +1. For all parent shards in the tenant, ensure they are attached +2. For all child shards, ensure they are not attached +3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards. + +Any remote storage content for child shards is left behind. This is similar to other cases where +we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an +index that references it). Future online scrub/cleanup functionality can remove these objects, or +they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix, +which would include any child shards that were rolled back. + +If any timelines had been created on child shards, they will be lost when rolling back. To mitigate +this, we will **block timeline creation during splitting**, so that we can safely roll back until +the split is complete, without risking losing timelines. + +Rolling back an incomplete split will happen automatically if a split fails due to some fatal +reason, and will not be accessible via an API: + +- A pageserver fails to complete its split API request after too many retries +- A pageserver returns a fatal unexpected error such as 400 or 500 +- The storage controller database returns a non-retryable error +- Some internal invariant is violated in the storage controller split code + +#### Rolling back a complete split + +A complete shard split may be rolled back similarly to an incomplete split, with the following +modifications: + +- The parent shards will no longer exist in the storage controller database, so these must + be re-synthesized somehow: the hard part of this is figuring out the parent shards' generations.
This + may be accomplished either by probing in S3, or by retaining some tombstone state for deleted + shards in the storage controller database. +- Any timelines that were created after the split completed will disappear when rolling back + to the tenant shards. For this reason, rolling back after a complete split should only + be done due to serious issues where loss of recently created timelines is acceptable, or + in cases where we have confirmed that no timelines were created in the intervening period. +- Parent shards' layers must not have been deleted: this property will come "for free" when + we first roll out sharding, by simply not implementing deletion of parent layers after + a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the + Optimizations section), it should apply a TTL to layers such that we have a + defined walltime window in which rollback will be possible. + +The storage controller will expose an API for rolling back a complete split, for use +in the field if we encounter some critical bug with a post-split tenant. + +#### Retrying API calls during Pageserver Restart + +When a pageserver restarts during a split API call, it may witness on-disk content for both parent and +child shards from an ongoing split. This does not intrinsically break anything, and the +pageserver may include all these shards in its `/re-attach` request to the storage controller. + +In order to support such restarts, it is important that the storage controller stores +persistent records of each child shard before it calls into a pageserver, as these child shards +may require generation increments via a `/re-attach` request. + +The pageserver restart will also result in a failed API call from the storage controller's point +of view. Recall that if _any_ pageserver fails to split, the overall split operation may not +complete, and all shards must remain pinned to their current pageserver locations until the +split is done.
+ +The pageserver API calls during splitting will retry on transient errors, so that +short availability gaps do not result in a failure of the overall operation. The +split in progress will be automatically rolled back if the threshold for API +retries is reached (e.g. if a pageserver stays offline for longer than a typical +restart). + +#### Rollback on Storage Controller Restart + +On startup, the storage controller will inspect the split bit for tenant shards that +it loads from the database. If any splits are in progress: + +- Database content will be reverted to the parent shards +- Child shards will be dropped from memory +- The parent and child shards will be included in the general startup reconciliation that + the storage controller does: any child shards will be detached from pageservers because + they don't exist in the storage controller's expected set of shards, and parent shards + will be attached if they aren't already. + +#### Storage controller API request failures/retries + +The split request handler will implement idempotency: if the [`Tenant`] requested to split +doesn't exist, we will check for the would-be child shards, and if they already exist, +we consider the request complete. + +If a request is retried while the original request is still underway, then the split +request handler will notice an InProgress marker in TenantManager, and return 503 +to encourage the client to backoff/retry. This is the same as the general pageserver +API handling for calls that try to act on an InProgress shard. + +#### Compute start/restart during a split + +If a compute starts up during a split, it will be configured with the old sharding +configuration. This will work for reads irrespective of the progress of the split +as long as no child shards have been migrated away from their original location, and +this is guaranteed in the split procedure (see earlier section).
+ +#### Pageserver fails permanently during a split + +If a pageserver permanently fails (i.e. the storage controller availability state for it +goes to Offline) while a split is in progress, the splitting operation will roll back, and +during the roll back it will skip any API calls to the offline pageserver. If the offline +pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API). + +### Handling secondary locations + +For correctness, it is not necessary to split secondary locations. We can simply detach +the secondary locations for parent shards, and then attach new secondary locations +for child shards. + +Clearly this is not optimal, as it will result in re-downloads of layer files that +were already present on disk. See "Splitting secondary locations" + +### Conditions to trigger a split + +The pageserver will expose a new API for reporting on shards that are candidates +for split: this will return a top-N report of the largest tenant shards by +physical size (remote size). This should exclude any tenants that are already +at the maximum configured shard count. + +The API would look something like: +`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size` + +The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds). + +A split operation will be started when the tenant exceeds some threshold. This threshold +should be _less than_ how large we actually want shards to be, perhaps much less. That's to +minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't +wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing +tenant size distribution may be useful here: if we can make a statement like "usually, if +a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might +make our policy to split a tenant at 20GiB. 
+ +The finest split we can do is by factors of two, but we can do higher-cardinality splits +too, and this will help to reduce the overhead of repeatedly re-splitting a tenant +as it grows. An example of a very simple heuristic for early deployment of the splitting +feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that +would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had +split a tenant, it will not need re-splitting soon after. + +## Optimizations + +### Flush parent shard to remote storage during split + +Any data that is in WAL but not remote storage at time of split will need +to be replayed by child shards when they start for the first time. To minimize +this work, we may flush the parent shard to remote storage before writing the +remote indices for child shards. + +It is important that this flush is subject to some time bounds: we may be splitting +in response to a surge of write ingest, so it may be time-critical to split. A +few seconds to flush latest data should be sufficient to optimize common cases without +running the risk of holding up a split for a harmful length of time when a parent +shard is being written heavily. If the flush doesn't complete in time, we may proceed +to shut down the parent shard and carry on with the split. + +### Hard linking parent layers into child shard directories + +Before we start the Tenant objects for child shards, we may pre-populate their +local storage directories with hard links to the layer files already present +in the parent shard's local directory. When the child shard starts and downloads +its remote index, it will find all those layer files already present on local disk. + +This avoids wasting download capacity and makes splitting faster, but more importantly +it avoids taking up a factor of N more disk space when splitting 1 shard into N. 
+ +This mechanism will work well in typical flows where shards are migrated away +promptly after a split, but for the general case including what happens when +layers are evicted and re-downloaded after a split, see the 'Proactive compaction' +section below. + +### Filtering during compaction + +Compaction, especially image layer generation, should skip any keys that are +present in a shard's layer files, but do not match the shard's ShardIdentity's +is_key_local() check. This avoids carrying around data for longer than necessary +in post-split compactions. + +This was already implemented in https://github.com/neondatabase/neon/pull/6246 + +### Proactive compaction + +In remote storage, there is little reason to rewrite any data on a shard split: +all the children can reference parent layers via the very cheap write of the child +index_part.json. + +In local storage, things are more nuanced. During the initial split there is no +capacity cost to duplicating parent layers, if we implement the hard linking +optimization described above. However, as soon as any layers are evicted from +local disk and re-downloaded, the downloaded layers will not be hard-links any more: +they'll have real capacity footprint. That isn't a problem if we migrate child shards +away from the parent node swiftly, but it risks a significant over-use of local disk +space if we do not. + +For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of +the shards elsewhere, then churned all the layers in all the shards via eviction, +then we would blow up the storage capacity used on the node by 8x. If we're splitting +a 100GB shard, that could take the pageserver to the point of exhausting disk space. + +To avoid this scenario, we could implement a special compaction mode where we just +read historic layers, drop unwanted keys, and write back the layer file. 
This +is pretty expensive, but useful if we have split a large shard and are not going to +migrate the child shards away. + +The heuristic conditions for triggering such a compaction are: + +- A) eviction plus time: if a child shard + has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load. +- B) resident size plus time: we may inspect the resident layers and calculate how + many of them include the overhead of storing pre-split keys. If, after some time + threshold (different to the one in case A), we still have such layers occupying + local disk space, then we should proactively compact them. + +### Cleaning up parent-shard layers + +It is functionally harmless to leave parent shard layers in remote storage indefinitely. +They would be cleaned up in the event of the tenant's deletion. + +As an optimization to avoid leaking remote storage capacity (which costs money), we may +lazily clean up parent shard layers once no child shards reference them. + +This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is: + +- list all the key prefixes beginning with the tenant ID, and select those shard prefixes + which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_) +- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and + may drop out now. +- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices. +- for all ancestral shards, list objects in the prefix and delete any layer which was not + referenced by a current shard.
+ +If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable. + +The cleanup may be done by the scrubber (external process), or we may choose to have +the zeroth shard in the latest generation do the work -- there is no obstacle to one shard +reading the other shard's indices at runtime, and we do not require visibility of the +latest index writes. + +Cleanup should be artificially delayed by some period (for example 24 hours) to ensure +that we retain the option to roll back a split in case of bugs. + +### Splitting secondary locations + +We may implement a pageserver API similar to the main splitting API, which does a simpler +operation for secondary locations: it would not write anything to S3, instead it would simply +create the child shard directory on local disk, hard link in directories from the parent, +and set up the in memory (TenantSlot) state for the children. + +Similar to attached locations, a subset of secondary locations will probably need re-locating +after the split is complete, to avoid leaving multiple child shards on the same pageservers, +where they may use excessive space for the tenant. + +## FAQ/Alternatives + +### What should the thresholds be set to? + +Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit. + +Max shard count: + +- The safekeeper overhead to sharding is currently O(N) network bandwidth because + the un-filtered WAL is sent to all shards. To avoid this growing out of control, + a limit of 8 shards should be temporarily imposed until WAL filtering is implemented + on the safekeeper. +- there is also little benefit to increasing the shard count beyond the number + of pageservers in a region. 
+ +### Is it worth just rewriting all the data during a split to simplify reasoning about space? diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 12fa80349e..3732bfdab2 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab Neon storage broker, providing messaging between safekeepers and pageservers. [storage_broker.md](./storage_broker.md) +`storage_controller`: + +Neon storage controller, manages a cluster of pageservers and exposes an API that enables +managing a many-sharded tenant as a single entity. + `/control_plane`: Local control plane. diff --git a/docs/storage_controller.md b/docs/storage_controller.md new file mode 100644 index 0000000000..daf4d0c8b7 --- /dev/null +++ b/docs/storage_controller.md @@ -0,0 +1,150 @@ +# Storage Controller + +## Concepts + +The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller, +which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations). + +It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding +the underlying details of how data is spread across multiple nodes. + +The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent. 
+ +## APIs + +The storage controller’s HTTP server implements four logically separate APIs: + +- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver. +- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits. +- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system. +- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers + to ensure data safety with generation numbers. + +The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs). + +See the `http.rs` file in the source for where the HTTP APIs are implemented. + +## Database + +The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not +persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and +rebuilt on startup. + +The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why. + +The `diesel` crate is used for defining models & migrations. + +Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller’s database.
+ +### Diesel tip: migrations + +If you need to modify the database schema, here’s how to create a migration: + +- Install the diesel CLI with `cargo install diesel_cli` +- Use `diesel migration generate <name>` to create a new migration +- Populate the SQL files in the `migrations/` subdirectory +- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically. + - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service` +- Commit the migration files and the changes to `schema.rs` +- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again. +- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed. + +## storcon_cli + +The `storcon_cli` tool enables interactive management of the storage controller. This is usually +only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline). + +`storcon_cli --help` includes details on commands. + +# Deploying + +This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as +part of a self-hosted system. + +_General note: since the default `neon_local` environment includes a storage controller, this is a useful +reference when figuring out deployment._ + +## Database + +It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral +local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver. + +The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments.
The physical size of the database is typically under a gigabyte. + +Set the URL to the database using the `--database-url` CLI option. + +There is no need to run migrations manually: the storage controller automatically applies migrations +when it starts up. + +## Configure pageservers to use the storage controller + +1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should + point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters. +2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself + with the storage controller when it starts up. See the example below for the format of this file. + +### Example `metadata.json` + +``` +{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000} +``` + +- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever + postgres runs. +- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where + the storage controller runs. + +## Handle compute notifications. + +The storage controller independently moves tenant attachments between pageservers in response to +changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable +postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver +location changes. + +The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires +JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. + +In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. 
In `neon_local` systems +the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling +the compute hook. + +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: +the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience. + +``` +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + stripe_size: Option<ShardStripeSize>, + shards: Vec<ComputeHookNotifyRequestShard>, +} +``` + +When a notification is received: + +1. Modify postgres configuration for this tenant: + + - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The + shards identified by `NodeId` must be converted to the address+port of the node. + - if stripe_size is not None, set `neon.stripe_size` to this value + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds. + +### Example notification body + +``` +{ + "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", + "stripe_size": 32768, + "shards": [ + {"node_id": 344, "shard_number": 0}, + {"node_id": 722, "shard_number": 1} + ] +} +``` diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 407d7b525a..3acb4e18cb 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -21,7 +21,7 @@ implementation where we keep more data than we would need to, do not change the synthetic size or incur any costs to the user. The synthetic size is calculated for the whole project. It is not -straighforward to attribute size to individual branches. See "What is +straightforward to attribute size to individual branches.
See "What is the size of an individual branch?" for discussion on those difficulties. @@ -248,7 +248,7 @@ and truncate the WAL. Synthetic size is calculated for the whole project, and includes all branches. There is no such thing as the size of a branch, because it -is not straighforward to attribute the parts of size to individual +is not straightforward to attribute the parts of size to individual branches. ## Example: attributing size to branches diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 92bbf79cd4..d05d625b0a 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -3,7 +3,7 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize, Serializer}; -use crate::spec::ComputeSpec; +use crate::spec::{ComputeSpec, Database, Role}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -52,6 +52,10 @@ pub enum ComputeStatus { // compute will exit soon or is waiting for // control-plane to terminate it. Failed, + // Termination requested + TerminationPending, + // Terminated Postgres + Terminated, } fn rfc3339_serialize(x: &Option>, s: S) -> Result @@ -109,6 +113,12 @@ pub struct ComputeMetrics { pub total_ext_download_size: u64, } +#[derive(Clone, Debug, Default, Serialize)] +pub struct CatalogObjects { + pub roles: Vec, + pub databases: Vec, +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. /// This is not actually a compute API response, so consider moving /// to a different place. diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 4ff6831272..1c4ee2089f 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -33,6 +33,23 @@ pub struct ComputeSpec { #[serde(default)] pub features: Vec, + /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs + /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first + /// received. 
+ /// + /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's + /// spec generation doesn't need to be aware of the actual compute it's running on, while + /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could + /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus + /// giving every VM much more swap than it should have (32GiB). + /// + /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for + /// enabling the swap resizing behavior once rollout is complete. + /// + /// See neondatabase/cloud#12047 for more. + #[serde(default)] + pub swap_size_bytes: Option, + /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, @@ -75,6 +92,16 @@ pub struct ComputeSpec { pub remote_extensions: Option, pub pgbouncer_settings: Option>, + + // Stripe size for pageserver sharding, in pages + #[serde(default)] + pub shard_stripe_size: Option, + + // When we are starting a new replica in hot standby mode, + // we need to know if the primary is running. + // This is used to determine if replica should wait for + // RUNNING_XACTS from primary or not. + pub primary_is_running: Option, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -82,10 +109,16 @@ pub struct ComputeSpec { #[serde(rename_all = "snake_case")] pub enum ComputeFeature { // XXX: Add more feature flags here. + /// Enable the experimental activity monitor logic, which uses `pg_stat_database` to + /// track short-lived connections as user activity. + ActivityMonitorExperimental, - // This is a special feature flag that is used to represent unknown feature flags. - // Basically all unknown to enum flags are represented as this one. See unit test - // `parse_unknown_features()` for more details. 
+ /// Pre-install and initialize anon extension for every database in the cluster + AnonExtension, + + /// This is a special feature flag that is used to represent unknown feature flags. + /// Basically all unknown to enum flags are represented as this one. See unit test + /// `parse_unknown_features()` for more details. #[serde(other)] UnknownFeature, } @@ -282,4 +315,23 @@ mod tests { assert!(spec.features.contains(&ComputeFeature::UnknownFeature)); assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]); } + + #[test] + fn parse_known_features() { + // Test that we can properly parse known feature flags. + let file = File::open("tests/cluster_spec.json").unwrap(); + let mut json: serde_json::Value = serde_json::from_reader(file).unwrap(); + let ob = json.as_object_mut().unwrap(); + + // Add known feature flags. + let features = vec!["activity_monitor_experimental"]; + ob.insert("features".into(), features.into()); + + let spec: ComputeSpec = serde_json::from_value(json).unwrap(); + + assert_eq!( + spec.features, + vec![ComputeFeature::ActivityMonitorExperimental] + ); + } } diff --git a/libs/desim/Cargo.toml b/libs/desim/Cargo.toml new file mode 100644 index 0000000000..6f442d8243 --- /dev/null +++ b/libs/desim/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "desim" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +rand.workspace = true +tracing.workspace = true +bytes.workspace = true +utils.workspace = true +parking_lot.workspace = true +hex.workspace = true +scopeguard.workspace = true +smallvec = { workspace = true, features = ["write"] } + +workspace_hack.workspace = true diff --git a/libs/desim/README.md b/libs/desim/README.md new file mode 100644 index 0000000000..80568ebb1b --- /dev/null +++ b/libs/desim/README.md @@ -0,0 +1,7 @@ +# Discrete Event SIMulator + +This is a library for running simulations of distributed systems. 
The main idea is borrowed from [FoundationDB](https://www.youtube.com/watch?v=4fFDFbi3toc). + +Each node runs as a separate thread. This library was not optimized for speed yet, but it's already much faster than running usual integration tests in real time, because it uses virtual simulation time and can fast-forward time to skip intervals where all nodes are doing nothing but sleeping or waiting for something. + +The original purpose for this library is to test walproposer and safekeeper implementation working together, in scenarios close to the real-world environment. This simulator is deterministic and can inject failures in networking without waiting minutes of wall-time to trigger timeout, which makes it easier to find bugs in our consensus implementation compared to using integration tests. diff --git a/libs/desim/src/chan.rs b/libs/desim/src/chan.rs new file mode 100644 index 0000000000..6661d59871 --- /dev/null +++ b/libs/desim/src/chan.rs @@ -0,0 +1,108 @@ +use std::{collections::VecDeque, sync::Arc}; + +use parking_lot::{Mutex, MutexGuard}; + +use crate::executor::{self, PollSome, Waker}; + +/// FIFO channel with blocking send and receive. Can be cloned and shared between threads. +/// Blocking functions should be used only from threads that are managed by the executor. +pub struct Chan { + shared: Arc>, +} + +impl Clone for Chan { + fn clone(&self) -> Self { + Chan { + shared: self.shared.clone(), + } + } +} + +impl Default for Chan { + fn default() -> Self { + Self::new() + } +} + +impl Chan { + pub fn new() -> Chan { + Chan { + shared: Arc::new(State { + queue: Mutex::new(VecDeque::new()), + waker: Waker::new(), + }), + } + } + + /// Get a message from the front of the queue, block if the queue is empty. + /// If not called from the executor thread, it can block forever. + pub fn recv(&self) -> T { + self.shared.recv() + } + + /// Panic if the queue is empty.
+ pub fn must_recv(&self) -> T { + self.shared + .try_recv() + .expect("message should've been ready") + } + + /// Get a message from the front of the queue, return None if the queue is empty. + /// Never blocks. + pub fn try_recv(&self) -> Option { + self.shared.try_recv() + } + + /// Send a message to the back of the queue. + pub fn send(&self, t: T) { + self.shared.send(t); + } +} + +struct State { + queue: Mutex>, + waker: Waker, +} + +impl State { + fn send(&self, t: T) { + self.queue.lock().push_back(t); + self.waker.wake_all(); + } + + fn try_recv(&self) -> Option { + let mut q = self.queue.lock(); + q.pop_front() + } + + fn recv(&self) -> T { + // interrupt the receiver to prevent consuming everything at once + executor::yield_me(0); + + let mut queue = self.queue.lock(); + if let Some(t) = queue.pop_front() { + return t; + } + loop { + self.waker.wake_me_later(); + if let Some(t) = queue.pop_front() { + return t; + } + MutexGuard::unlocked(&mut queue, || { + executor::yield_me(-1); + }); + } + } +} + +impl PollSome for Chan { + /// Schedules a wakeup for the current thread. + fn wake_me(&self) { + self.shared.waker.wake_me_later(); + } + + /// Checks if chan has any pending messages. + fn has_some(&self) -> bool { + !self.shared.queue.lock().is_empty() + } +} diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs new file mode 100644 index 0000000000..9d44bd7741 --- /dev/null +++ b/libs/desim/src/executor.rs @@ -0,0 +1,483 @@ +use std::{ + panic::AssertUnwindSafe, + sync::{ + atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering}, + mpsc, Arc, OnceLock, + }, + thread::JoinHandle, +}; + +use tracing::{debug, error, trace}; + +use crate::time::Timing; + +/// Stores status of the running threads. Threads are registered in the runtime upon creation +/// and deregistered upon termination. 
+pub struct Runtime { + // stores handles to all threads that are currently running + threads: Vec, + // stores current time and pending wakeups + clock: Arc, + // thread counter + thread_counter: AtomicU32, + // Thread step counter -- how many times all threads have actually been + // stepped (note that all world/time/executor/thread have slightly different + // meaning of steps). For observability. + pub step_counter: u64, +} + +impl Runtime { + /// Init new runtime, no running threads. + pub fn new(clock: Arc) -> Self { + Self { + threads: Vec::new(), + clock, + thread_counter: AtomicU32::new(0), + step_counter: 0, + } + } + + /// Spawn a new thread and register it in the runtime. + pub fn spawn(&mut self, f: F) -> ExternalHandle + where + F: FnOnce() + Send + 'static, + { + let (tx, rx) = mpsc::channel(); + + let clock = self.clock.clone(); + let tid = self.thread_counter.fetch_add(1, Ordering::SeqCst); + debug!("spawning thread-{}", tid); + + let join = std::thread::spawn(move || { + let _guard = tracing::info_span!("", tid).entered(); + + let res = std::panic::catch_unwind(AssertUnwindSafe(|| { + with_thread_context(|ctx| { + assert!(ctx.clock.set(clock).is_ok()); + ctx.id.store(tid, Ordering::SeqCst); + tx.send(ctx.clone()).expect("failed to send thread context"); + // suspend thread to put it to `threads` in sleeping state + ctx.yield_me(0); + }); + + // start user-provided function + f(); + })); + debug!("thread finished"); + + if let Err(e) = res { + with_thread_context(|ctx| { + if !ctx.allow_panic.load(std::sync::atomic::Ordering::SeqCst) { + error!("thread panicked, terminating the process: {:?}", e); + std::process::exit(1); + } + + debug!("thread panicked: {:?}", e); + let mut result = ctx.result.lock(); + if result.0 == -1 { + *result = (256, format!("thread panicked: {:?}", e)); + } + }); + } + + with_thread_context(|ctx| { + ctx.finish_me(); + }); + }); + + let ctx = rx.recv().expect("failed to receive thread context"); + let handle =
ThreadHandle::new(ctx.clone(), join); + + self.threads.push(handle); + + ExternalHandle { ctx } + } + + /// Returns true if there is any unfinished activity, such as a running thread or pending events. + /// Otherwise returns false, which means all threads are blocked forever. + pub fn step(&mut self) -> bool { + trace!("runtime step"); + + // have we run any thread? + let mut ran = false; + + self.threads.retain(|thread: &ThreadHandle| { + let res = thread.ctx.wakeup.compare_exchange( + PENDING_WAKEUP, + NO_WAKEUP, + Ordering::SeqCst, + Ordering::SeqCst, + ); + if res.is_err() { + // thread has no pending wakeups, leaving as is + return true; + } + ran = true; + + trace!("entering thread-{}", thread.ctx.tid()); + let status = thread.step(); + self.step_counter += 1; + trace!( + "out of thread-{} with status {:?}", + thread.ctx.tid(), + status + ); + + if status == Status::Sleep { + true + } else { + trace!("thread has finished"); + // removing the thread from the list + false + } + }); + + if !ran { + trace!("no threads were run, stepping clock"); + if let Some(ctx_to_wake) = self.clock.step() { + trace!("waking up thread-{}", ctx_to_wake.tid()); + ctx_to_wake.inc_wake(); + } else { + return false; + } + } + + true + } + + /// Kill all threads. This is done by setting a flag in each thread context and waking it up. + pub fn crash_all_threads(&mut self) { + for thread in self.threads.iter() { + thread.ctx.crash_stop(); + } + + // all threads should be finished after a few steps + while !self.threads.is_empty() { + self.step(); + } + } +} + +impl Drop for Runtime { + fn drop(&mut self) { + debug!("dropping the runtime"); + self.crash_all_threads(); + } +} + +#[derive(Clone)] +pub struct ExternalHandle { + ctx: Arc, +} + +impl ExternalHandle { + /// Returns true if thread has finished execution.
+ pub fn is_finished(&self) -> bool { + let status = self.ctx.mutex.lock(); + *status == Status::Finished + } + + /// Returns exitcode and message, which is available after thread has finished execution. + pub fn result(&self) -> (i32, String) { + let result = self.ctx.result.lock(); + result.clone() + } + + /// Returns thread id. + pub fn id(&self) -> u32 { + self.ctx.id.load(Ordering::SeqCst) + } + + /// Sets a flag to crash thread on the next wakeup. + pub fn crash_stop(&self) { + self.ctx.crash_stop(); + } +} + +struct ThreadHandle { + ctx: Arc, + _join: JoinHandle<()>, +} + +impl ThreadHandle { + /// Create a new [`ThreadHandle`] and wait until thread will enter [`Status::Sleep`] state. + fn new(ctx: Arc, join: JoinHandle<()>) -> Self { + let mut status = ctx.mutex.lock(); + // wait until thread will go into the first yield + while *status != Status::Sleep { + ctx.condvar.wait(&mut status); + } + drop(status); + + Self { ctx, _join: join } + } + + /// Allows thread to execute one step of its execution. + /// Returns [`Status`] of the thread after the step. + fn step(&self) -> Status { + let mut status = self.ctx.mutex.lock(); + assert!(matches!(*status, Status::Sleep)); + + *status = Status::Running; + self.ctx.condvar.notify_all(); + + while *status == Status::Running { + self.ctx.condvar.wait(&mut status); + } + + *status + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum Status { + /// Thread is running. + Running, + /// Waiting for event to complete, will be resumed by the executor step, once wakeup flag is set. + Sleep, + /// Thread finished execution. 
+ Finished, +} + +const NO_WAKEUP: u8 = 0; +const PENDING_WAKEUP: u8 = 1; + +pub struct ThreadContext { + id: AtomicU32, + // used to block thread until it is woken up + mutex: parking_lot::Mutex, + condvar: parking_lot::Condvar, + // used as a flag to indicate runtime that thread is ready to be woken up + wakeup: AtomicU8, + clock: OnceLock>, + // execution result, set by exit() call + result: parking_lot::Mutex<(i32, String)>, + // determines if process should be killed on receiving panic + allow_panic: AtomicBool, + // acts as a signal that thread should crash itself on the next wakeup + crash_request: AtomicBool, +} + +impl ThreadContext { + pub(crate) fn new() -> Self { + Self { + id: AtomicU32::new(0), + mutex: parking_lot::Mutex::new(Status::Running), + condvar: parking_lot::Condvar::new(), + wakeup: AtomicU8::new(NO_WAKEUP), + clock: OnceLock::new(), + result: parking_lot::Mutex::new((-1, String::new())), + allow_panic: AtomicBool::new(false), + crash_request: AtomicBool::new(false), + } + } +} + +// Functions for executor to control thread execution. +impl ThreadContext { + /// Set atomic flag to indicate that thread is ready to be woken up. + fn inc_wake(&self) { + self.wakeup.store(PENDING_WAKEUP, Ordering::SeqCst); + } + + /// Internal function used for event queues. + pub(crate) fn schedule_wakeup(self: &Arc, after_ms: u64) { + self.clock + .get() + .unwrap() + .schedule_wakeup(after_ms, self.clone()); + } + + fn tid(&self) -> u32 { + self.id.load(Ordering::SeqCst) + } + + fn crash_stop(&self) { + let status = self.mutex.lock(); + if *status == Status::Finished { + debug!( + "trying to crash thread-{}, which is already finished", + self.tid() + ); + return; + } + assert!(matches!(*status, Status::Sleep)); + drop(status); + + self.allow_panic.store(true, Ordering::SeqCst); + self.crash_request.store(true, Ordering::SeqCst); + // set a wakeup + self.inc_wake(); + // it will panic on the next wakeup + } +} + +// Internal functions. 
+impl ThreadContext { + /// Blocks thread until it's woken up by the executor. If `after_ms` is 0, it will be + /// woken on the next step. If `after_ms` > 0, wakeup is scheduled after that time. + /// Otherwise wakeup is not scheduled inside `yield_me`, and should be arranged before + /// calling this function. + fn yield_me(self: &Arc, after_ms: i64) { + let mut status = self.mutex.lock(); + assert!(matches!(*status, Status::Running)); + + match after_ms.cmp(&0) { + std::cmp::Ordering::Less => { + // block until something wakes us up + } + std::cmp::Ordering::Equal => { + // tell executor that we are ready to be woken up + self.inc_wake(); + } + std::cmp::Ordering::Greater => { + // schedule wakeup + self.clock + .get() + .unwrap() + .schedule_wakeup(after_ms as u64, self.clone()); + } + } + + *status = Status::Sleep; + self.condvar.notify_all(); + + // wait until executor wakes us up + while *status != Status::Running { + self.condvar.wait(&mut status); + } + + if self.crash_request.load(Ordering::SeqCst) { + panic!("crashed by request"); + } + } + + /// Called only once, exactly before thread finishes execution. + fn finish_me(&self) { + let mut status = self.mutex.lock(); + assert!(matches!(*status, Status::Running)); + + *status = Status::Finished; + { + let mut result = self.result.lock(); + if result.0 == -1 { + *result = (0, "finished normally".to_owned()); + } + } + self.condvar.notify_all(); + } +} + +/// Invokes the given closure with a reference to the current thread [`ThreadContext`]. +#[inline(always)] +fn with_thread_context(f: impl FnOnce(&Arc) -> T) -> T { + thread_local!(static THREAD_DATA: Arc = Arc::new(ThreadContext::new())); + THREAD_DATA.with(f) +} + +/// Waker is used to wake up threads that are blocked on condition. +/// It keeps track of contexts [`Arc`] and can increment the counter +/// of several contexts to send a notification.
+pub struct Waker { + // contexts that are waiting for a notification + contexts: parking_lot::Mutex; 8]>>, +} + +impl Default for Waker { + fn default() -> Self { + Self::new() + } +} + +impl Waker { + pub fn new() -> Self { + Self { + contexts: parking_lot::Mutex::new(smallvec::SmallVec::new()), + } + } + + /// Subscribe current thread to receive a wake notification later. + pub fn wake_me_later(&self) { + with_thread_context(|ctx| { + self.contexts.lock().push(ctx.clone()); + }); + } + + /// Wake up all threads that are waiting for a notification and clear the list. + pub fn wake_all(&self) { + let mut v = self.contexts.lock(); + for ctx in v.iter() { + ctx.inc_wake(); + } + v.clear(); + } +} + +/// See [`ThreadContext::yield_me`]. +pub fn yield_me(after_ms: i64) { + with_thread_context(|ctx| ctx.yield_me(after_ms)) +} + +/// Get current time. +pub fn now() -> u64 { + with_thread_context(|ctx| ctx.clock.get().unwrap().now()) +} + +pub fn exit(code: i32, msg: String) { + with_thread_context(|ctx| { + ctx.allow_panic.store(true, Ordering::SeqCst); + let mut result = ctx.result.lock(); + *result = (code, msg); + panic!("exit"); + }); +} + +pub(crate) fn get_thread_ctx() -> Arc { + with_thread_context(|ctx| ctx.clone()) +} + +/// Trait for polling channels until they have something. +pub trait PollSome { + /// Schedule wakeup for message arrival. + fn wake_me(&self); + + /// Check if channel has a ready message. + fn has_some(&self) -> bool; +} + +/// Blocks current thread until one of the channels has a ready message. Returns +/// index of the channel that has a message. If timeout is reached, returns None. +/// +/// Negative timeout means block forever. Zero timeout means check channels and return +/// immediately. Positive timeout means block until timeout is reached. 
+pub fn epoll_chans(chans: &[Box], timeout: i64) -> Option { + let deadline = if timeout < 0 { + 0 + } else { + now() + timeout as u64 + }; + + loop { + for chan in chans { + chan.wake_me() + } + + for (i, chan) in chans.iter().enumerate() { + if chan.has_some() { + return Some(i); + } + } + + if timeout < 0 { + // block until wakeup + yield_me(-1); + } else { + let current_time = now(); + if current_time >= deadline { + return None; + } + + yield_me((deadline - current_time) as i64); + } + } +} diff --git a/libs/desim/src/lib.rs b/libs/desim/src/lib.rs new file mode 100644 index 0000000000..14f5a885c5 --- /dev/null +++ b/libs/desim/src/lib.rs @@ -0,0 +1,8 @@ +pub mod chan; +pub mod executor; +pub mod network; +pub mod node_os; +pub mod options; +pub mod proto; +pub mod time; +pub mod world; diff --git a/libs/desim/src/network.rs b/libs/desim/src/network.rs new file mode 100644 index 0000000000..e15a714daa --- /dev/null +++ b/libs/desim/src/network.rs @@ -0,0 +1,451 @@ +use std::{ + cmp::Ordering, + collections::{BinaryHeap, VecDeque}, + fmt::{self, Debug}, + ops::DerefMut, + sync::{mpsc, Arc}, +}; + +use parking_lot::{ + lock_api::{MappedMutexGuard, MutexGuard}, + Mutex, RawMutex, +}; +use rand::rngs::StdRng; +use tracing::debug; + +use crate::{ + executor::{self, ThreadContext}, + options::NetworkOptions, + proto::NetEvent, + proto::NodeEvent, +}; + +use super::{chan::Chan, proto::AnyMessage}; + +pub struct NetworkTask { + options: Arc, + connections: Mutex>, + /// min-heap of connections having something to deliver. 
+ events: Mutex>, + task_context: Arc, +} + +impl NetworkTask { + pub fn start_new(options: Arc, tx: mpsc::Sender>) { + let ctx = executor::get_thread_ctx(); + let task = Arc::new(Self { + options, + connections: Mutex::new(Vec::new()), + events: Mutex::new(BinaryHeap::new()), + task_context: ctx, + }); + + // send the task upstream + tx.send(task.clone()).unwrap(); + + // start the task + task.start(); + } + + pub fn start_new_connection(self: &Arc, rng: StdRng, dst_accept: Chan) -> TCP { + let now = executor::now(); + let connection_id = self.connections.lock().len(); + + let vc = VirtualConnection { + connection_id, + dst_accept, + dst_sockets: [Chan::new(), Chan::new()], + state: Mutex::new(ConnectionState { + buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))], + rng, + }), + }; + vc.schedule_timeout(self); + vc.send_connect(self); + + let recv_chan = vc.dst_sockets[0].clone(); + self.connections.lock().push(vc); + + TCP { + net: self.clone(), + conn_id: connection_id, + dir: 0, + recv_chan, + } + } +} + +// private functions +impl NetworkTask { + /// Schedule to wakeup network task (self) `after_ms` later to deliver + /// messages of connection `id`. + fn schedule(&self, id: usize, after_ms: u64) { + self.events.lock().push(Event { + time: executor::now() + after_ms, + conn_id: id, + }); + self.task_context.schedule_wakeup(after_ms); + } + + /// Get locked connection `id`. 
+ fn get(&self, id: usize) -> MappedMutexGuard<'_, RawMutex, VirtualConnection> { + MutexGuard::map(self.connections.lock(), |connections| { + connections.get_mut(id).unwrap() + }) + } + + fn collect_pending_events(&self, now: u64, vec: &mut Vec) { + vec.clear(); + let mut events = self.events.lock(); + while let Some(event) = events.peek() { + if event.time > now { + break; + } + let event = events.pop().unwrap(); + vec.push(event); + } + } + + fn start(self: &Arc) { + debug!("started network task"); + + let mut events = Vec::new(); + loop { + let now = executor::now(); + self.collect_pending_events(now, &mut events); + + for event in events.drain(..) { + let conn = self.get(event.conn_id); + conn.process(self); + } + + // block until wakeup + executor::yield_me(-1); + } + } +} + +// 0 - from node(0) to node(1) +// 1 - from node(1) to node(0) +type MessageDirection = u8; + +fn sender_str(dir: MessageDirection) -> &'static str { + match dir { + 0 => "client", + 1 => "server", + _ => unreachable!(), + } +} + +fn receiver_str(dir: MessageDirection) -> &'static str { + match dir { + 0 => "server", + 1 => "client", + _ => unreachable!(), + } +} + +/// Virtual connection between two nodes. +/// Node 0 is the creator of the connection (client), +/// and node 1 is the acceptor (server). +struct VirtualConnection { + connection_id: usize, + /// one-off chan, used to deliver Accept message to dst + dst_accept: Chan, + /// message sinks + dst_sockets: [Chan; 2], + state: Mutex, +} + +struct ConnectionState { + buffers: [NetworkBuffer; 2], + rng: StdRng, +} + +impl VirtualConnection { + /// Notify the future about the possible timeout. + fn schedule_timeout(&self, net: &NetworkTask) { + if let Some(timeout) = net.options.keepalive_timeout { + net.schedule(self.connection_id, timeout); + } + } + + /// Send the handshake (Accept) to the server. 
+ fn send_connect(&self, net: &NetworkTask) { + let now = executor::now(); + let mut state = self.state.lock(); + let delay = net.options.connect_delay.delay(&mut state.rng); + let buffer = &mut state.buffers[0]; + assert!(buffer.buf.is_empty()); + assert!(!buffer.recv_closed); + assert!(!buffer.send_closed); + assert!(buffer.last_recv.is_none()); + + let delay = if let Some(ms) = delay { + ms + } else { + debug!("NET: TCP #{} dropped connect", self.connection_id); + buffer.send_closed = true; + return; + }; + + // Send a message into the future. + buffer + .buf + .push_back((now + delay, AnyMessage::InternalConnect)); + net.schedule(self.connection_id, delay); + } + + /// Transmit some of the messages from the buffer to the nodes. + fn process(&self, net: &Arc) { + let now = executor::now(); + + let mut state = self.state.lock(); + + for direction in 0..2 { + self.process_direction( + net, + state.deref_mut(), + now, + direction as MessageDirection, + &self.dst_sockets[direction ^ 1], + ); + } + + // Close the one side of the connection by timeout if the node + // has not received any messages for a long time. + if let Some(timeout) = net.options.keepalive_timeout { + let mut to_close = [false, false]; + for direction in 0..2 { + let buffer = &mut state.buffers[direction]; + if buffer.recv_closed { + continue; + } + if let Some(last_recv) = buffer.last_recv { + if now - last_recv >= timeout { + debug!( + "NET: connection {} timed out at {}", + self.connection_id, + receiver_str(direction as MessageDirection) + ); + let node_idx = direction ^ 1; + to_close[node_idx] = true; + } + } + } + drop(state); + + for (node_idx, should_close) in to_close.iter().enumerate() { + if *should_close { + self.close(node_idx); + } + } + } + } + + /// Process messages in the buffer in the given direction. 
+ fn process_direction( + &self, + net: &Arc, + state: &mut ConnectionState, + now: u64, + direction: MessageDirection, + to_socket: &Chan, + ) { + let buffer = &mut state.buffers[direction as usize]; + if buffer.recv_closed { + assert!(buffer.buf.is_empty()); + } + + while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now { + let msg = buffer.buf.pop_front().unwrap().1; + + buffer.last_recv = Some(now); + self.schedule_timeout(net); + + if let AnyMessage::InternalConnect = msg { + // TODO: assert to_socket is the server + let server_to_client = TCP { + net: net.clone(), + conn_id: self.connection_id, + dir: direction ^ 1, + recv_chan: to_socket.clone(), + }; + // special case, we need to deliver new connection to a separate channel + self.dst_accept.send(NodeEvent::Accept(server_to_client)); + } else { + to_socket.send(NetEvent::Message(msg)); + } + } + } + + /// Try to send a message to the buffer, optionally dropping it and + /// determining delivery timestamp. + fn send(&self, net: &NetworkTask, direction: MessageDirection, msg: AnyMessage) { + let now = executor::now(); + let mut state = self.state.lock(); + + let (delay, close) = if let Some(ms) = net.options.send_delay.delay(&mut state.rng) { + (ms, false) + } else { + (0, true) + }; + + let buffer = &mut state.buffers[direction as usize]; + if buffer.send_closed { + debug!( + "NET: TCP #{} dropped message {:?} (broken pipe)", + self.connection_id, msg + ); + return; + } + + if close { + debug!( + "NET: TCP #{} dropped message {:?} (pipe just broke)", + self.connection_id, msg + ); + buffer.send_closed = true; + return; + } + + if buffer.recv_closed { + debug!( + "NET: TCP #{} dropped message {:?} (recv closed)", + self.connection_id, msg + ); + return; + } + + // Send a message into the future. + buffer.buf.push_back((now + delay, msg)); + net.schedule(self.connection_id, delay); + } + + /// Close the connection. 
Only one side of the connection will be closed, + /// and no further messages will be delivered. The other side will not be notified. + fn close(&self, node_idx: usize) { + let mut state = self.state.lock(); + let recv_buffer = &mut state.buffers[1 ^ node_idx]; + if recv_buffer.recv_closed { + debug!( + "NET: TCP #{} closed twice at {}", + self.connection_id, + sender_str(node_idx as MessageDirection), + ); + return; + } + + debug!( + "NET: TCP #{} closed at {}", + self.connection_id, + sender_str(node_idx as MessageDirection), + ); + recv_buffer.recv_closed = true; + for msg in recv_buffer.buf.drain(..) { + debug!( + "NET: TCP #{} dropped message {:?} (closed)", + self.connection_id, msg + ); + } + + let send_buffer = &mut state.buffers[node_idx]; + send_buffer.send_closed = true; + drop(state); + + // TODO: notify the other side? + + self.dst_sockets[node_idx].send(NetEvent::Closed); + } +} + +struct NetworkBuffer { + /// Messages paired with time of delivery + buf: VecDeque<(u64, AnyMessage)>, + /// True if the connection is closed on the receiving side, + /// i.e. no more messages from the buffer will be delivered. + recv_closed: bool, + /// True if the connection is closed on the sending side, + /// i.e. no more messages will be added to the buffer. + send_closed: bool, + /// Last time a message was delivered from the buffer. + /// If None, it means that the server is the receiver and + /// it is not yet aware of this connection (i.e. has not + /// received the Accept). + last_recv: Option, +} + +impl NetworkBuffer { + fn new(last_recv: Option) -> Self { + Self { + buf: VecDeque::new(), + recv_closed: false, + send_closed: false, + last_recv, + } + } +} + +/// Single end of a bidirectional network stream without reordering (TCP-like). +/// Reads are implemented using channels, writes go to the buffer inside VirtualConnection.
+pub struct TCP { + net: Arc, + conn_id: usize, + dir: MessageDirection, + recv_chan: Chan, +} + +impl Debug for TCP { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "TCP #{} ({})", self.conn_id, sender_str(self.dir),) + } +} + +impl TCP { + /// Send a message to the other side. It's guaranteed that it will not arrive + /// before the arrival of all messages sent earlier. + pub fn send(&self, msg: AnyMessage) { + let conn = self.net.get(self.conn_id); + conn.send(&self.net, self.dir, msg); + } + + /// Get a channel to receive incoming messages. + pub fn recv_chan(&self) -> Chan { + self.recv_chan.clone() + } + + pub fn connection_id(&self) -> usize { + self.conn_id + } + + pub fn close(&self) { + let conn = self.net.get(self.conn_id); + conn.close(self.dir as usize); + } +} +struct Event { + time: u64, + conn_id: usize, +} + +// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here +// to get that. +impl PartialOrd for Event { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Event { + fn cmp(&self, other: &Self) -> Ordering { + (other.time, other.conn_id).cmp(&(self.time, self.conn_id)) + } +} + +impl PartialEq for Event { + fn eq(&self, other: &Self) -> bool { + (other.time, other.conn_id) == (self.time, self.conn_id) + } +} + +impl Eq for Event {} diff --git a/libs/desim/src/node_os.rs b/libs/desim/src/node_os.rs new file mode 100644 index 0000000000..7744a9f5e1 --- /dev/null +++ b/libs/desim/src/node_os.rs @@ -0,0 +1,54 @@ +use std::sync::Arc; + +use rand::Rng; + +use crate::proto::NodeEvent; + +use super::{ + chan::Chan, + network::TCP, + world::{Node, NodeId, World}, +}; + +/// Abstraction with all functions (aka syscalls) available to the node. +#[derive(Clone)] +pub struct NodeOs { + world: Arc, + internal: Arc, +} + +impl NodeOs { + pub fn new(world: Arc, internal: Arc) -> NodeOs { + NodeOs { world, internal } + } + + /// Get the node id. 
+ pub fn id(&self) -> NodeId { + self.internal.id + } + + /// Opens a bidirectional connection with the other node. Always successful. + pub fn open_tcp(&self, dst: NodeId) -> TCP { + self.world.open_tcp(dst) + } + + /// Returns a channel to receive node events (socket Accept and internal messages). + pub fn node_events(&self) -> Chan { + self.internal.node_events() + } + + /// Get current time. + pub fn now(&self) -> u64 { + self.world.now() + } + + /// Generate a random number in range [0, max). + pub fn random(&self, max: u64) -> u64 { + self.internal.rng.lock().gen_range(0..max) + } + + /// Append a new event to the world event log. + pub fn log_event(&self, data: String) { + self.internal.log_event(data) + } +} diff --git a/libs/desim/src/options.rs b/libs/desim/src/options.rs new file mode 100644 index 0000000000..5da7c2c482 --- /dev/null +++ b/libs/desim/src/options.rs @@ -0,0 +1,50 @@ +use rand::{rngs::StdRng, Rng}; + +/// Describes random delays and failures. Delay will be uniformly distributed in [min, max]. +/// Connection failure will occur with the probability fail_prob. +#[derive(Clone, Debug)] +pub struct Delay { + pub min: u64, + pub max: u64, + pub fail_prob: f64, // [0; 1] +} + +impl Delay { + /// Create a struct with no delay, no failures. + pub fn empty() -> Delay { + Delay { + min: 0, + max: 0, + fail_prob: 0.0, + } + } + + /// Create a struct with a fixed delay. + pub fn fixed(ms: u64) -> Delay { + Delay { + min: ms, + max: ms, + fail_prob: 0.0, + } + } + + /// Generate a random delay in range [min, max]. Return None if the + /// message should be dropped. + pub fn delay(&self, rng: &mut StdRng) -> Option { + if rng.gen_bool(self.fail_prob) { + return None; + } + Some(rng.gen_range(self.min..=self.max)) + } +} + +/// Describes network settings. All network packets will be subjected to the same delays and failures.
+#[derive(Clone, Debug)] +pub struct NetworkOptions { + /// Connection will be automatically closed after this timeout if no data is received. + pub keepalive_timeout: Option, + /// New connections will be delayed by this amount of time. + pub connect_delay: Delay, + /// Each message will be delayed by this amount of time. + pub send_delay: Delay, +} diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs new file mode 100644 index 0000000000..92a7e8a27d --- /dev/null +++ b/libs/desim/src/proto.rs @@ -0,0 +1,63 @@ +use std::fmt::Debug; + +use bytes::Bytes; +use utils::lsn::Lsn; + +use crate::{network::TCP, world::NodeId}; + +/// Internal node events. +#[derive(Debug)] +pub enum NodeEvent { + Accept(TCP), + Internal(AnyMessage), +} + +/// Events that are coming from a network socket. +#[derive(Clone, Debug)] +pub enum NetEvent { + Message(AnyMessage), + Closed, +} + +/// Custom events generated throughout the simulation. Can be used by the test to verify the correctness. +#[derive(Debug)] +pub struct SimEvent { + pub time: u64, + pub node: NodeId, + pub data: String, +} + +/// Umbrella type for all possible flavours of messages. These events can be sent over network +/// or to an internal node events channel. +#[derive(Clone)] +pub enum AnyMessage { + /// Not used, empty placeholder. + None, + /// Used internally for notifying node about new incoming connection. 
+ InternalConnect, + Just32(u32), + ReplCell(ReplCell), + Bytes(Bytes), + LSN(u64), +} + +impl Debug for AnyMessage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AnyMessage::None => write!(f, "None"), + AnyMessage::InternalConnect => write!(f, "InternalConnect"), + AnyMessage::Just32(v) => write!(f, "Just32({})", v), + AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v), + AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)), + AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)), + } + } +} + +/// Used in reliable_copy_test.rs +#[derive(Clone, Debug)] +pub struct ReplCell { + pub value: u32, + pub client_id: u32, + pub seqno: u32, +} diff --git a/libs/desim/src/time.rs b/libs/desim/src/time.rs new file mode 100644 index 0000000000..7bb71db95c --- /dev/null +++ b/libs/desim/src/time.rs @@ -0,0 +1,129 @@ +use std::{ + cmp::Ordering, + collections::BinaryHeap, + ops::DerefMut, + sync::{ + atomic::{AtomicU32, AtomicU64}, + Arc, + }, +}; + +use parking_lot::Mutex; +use tracing::trace; + +use crate::executor::ThreadContext; + +/// Holds current time and all pending wakeup events. +pub struct Timing { + /// Current world's time. + current_time: AtomicU64, + /// Pending timers. + queue: Mutex>, + /// Global nonce. Makes picking events from binary heap queue deterministic + /// by appending a number to events with the same timestamp. + nonce: AtomicU32, + /// Used to schedule fake events. + fake_context: Arc, +} + +impl Default for Timing { + fn default() -> Self { + Self::new() + } +} + +impl Timing { + /// Create a new empty clock with time set to 0. + pub fn new() -> Timing { + Timing { + current_time: AtomicU64::new(0), + queue: Mutex::new(BinaryHeap::new()), + nonce: AtomicU32::new(0), + fake_context: Arc::new(ThreadContext::new()), + } + } + + /// Return the current world's time. + pub fn now(&self) -> u64 { + self.current_time.load(std::sync::atomic::Ordering::SeqCst) + } + + /// Tick-tock the global clock. 
Return the event ready to be processed + /// or move the clock forward and then return the event. + pub(crate) fn step(&self) -> Option> { + let mut queue = self.queue.lock(); + + if queue.is_empty() { + // no future events + return None; + } + + if !self.is_event_ready(queue.deref_mut()) { + let next_time = queue.peek().unwrap().time; + self.current_time + .store(next_time, std::sync::atomic::Ordering::SeqCst); + trace!("rewind time to {}", next_time); + assert!(self.is_event_ready(queue.deref_mut())); + } + + Some(queue.pop().unwrap().wake_context) + } + + /// Append an event to the queue, to wakeup the thread in `ms` milliseconds. + pub(crate) fn schedule_wakeup(&self, ms: u64, wake_context: Arc) { + self.nonce.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + let nonce = self.nonce.load(std::sync::atomic::Ordering::SeqCst); + self.queue.lock().push(Pending { + time: self.now() + ms, + nonce, + wake_context, + }) + } + + /// Append a fake event to the queue, to prevent clocks from skipping this time. + pub fn schedule_fake(&self, ms: u64) { + self.queue.lock().push(Pending { + time: self.now() + ms, + nonce: 0, + wake_context: self.fake_context.clone(), + }); + } + + /// Return true if there is a ready event. + fn is_event_ready(&self, queue: &mut BinaryHeap) -> bool { + queue.peek().map_or(false, |x| x.time <= self.now()) + } + + /// Clear all pending events. + pub(crate) fn clear(&self) { + self.queue.lock().clear(); + } +} + +struct Pending { + time: u64, + nonce: u32, + wake_context: Arc, +} + +// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here +// to get that. 
+impl PartialOrd for Pending { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Pending { + fn cmp(&self, other: &Self) -> Ordering { + (other.time, other.nonce).cmp(&(self.time, self.nonce)) + } +} + +impl PartialEq for Pending { + fn eq(&self, other: &Self) -> bool { + (other.time, other.nonce) == (self.time, self.nonce) + } +} + +impl Eq for Pending {} diff --git a/libs/desim/src/world.rs b/libs/desim/src/world.rs new file mode 100644 index 0000000000..7d60be04b5 --- /dev/null +++ b/libs/desim/src/world.rs @@ -0,0 +1,180 @@ +use parking_lot::Mutex; +use rand::{rngs::StdRng, SeedableRng}; +use std::{ + ops::DerefMut, + sync::{mpsc, Arc}, +}; + +use crate::{ + executor::{ExternalHandle, Runtime}, + network::NetworkTask, + options::NetworkOptions, + proto::{NodeEvent, SimEvent}, + time::Timing, +}; + +use super::{chan::Chan, network::TCP, node_os::NodeOs}; + +pub type NodeId = u32; + +/// World contains simulation state. +pub struct World { + nodes: Mutex>>, + /// Random number generator. + rng: Mutex, + /// Internal event log. + events: Mutex>, + /// Separate task that processes all network messages. + network_task: Arc, + /// Runtime for running threads and moving time. + runtime: Mutex, + /// To get current time. 
+ timing: Arc, +} + +impl World { + pub fn new(seed: u64, options: Arc) -> World { + let timing = Arc::new(Timing::new()); + let mut runtime = Runtime::new(timing.clone()); + + let (tx, rx) = mpsc::channel(); + + runtime.spawn(move || { + // create and start network background thread, and send it back via the channel + NetworkTask::start_new(options, tx) + }); + + // wait for the network task to start + while runtime.step() {} + + let network_task = rx.recv().unwrap(); + + World { + nodes: Mutex::new(Vec::new()), + rng: Mutex::new(StdRng::seed_from_u64(seed)), + events: Mutex::new(Vec::new()), + network_task, + runtime: Mutex::new(runtime), + timing, + } + } + + pub fn step(&self) -> bool { + self.runtime.lock().step() + } + + pub fn get_thread_step_count(&self) -> u64 { + self.runtime.lock().step_counter + } + + /// Create a new random number generator. + pub fn new_rng(&self) -> StdRng { + let mut rng = self.rng.lock(); + StdRng::from_rng(rng.deref_mut()).unwrap() + } + + /// Create a new node. + pub fn new_node(self: &Arc) -> Arc { + let mut nodes = self.nodes.lock(); + let id = nodes.len() as NodeId; + let node = Arc::new(Node::new(id, self.clone(), self.new_rng())); + nodes.push(node.clone()); + node + } + + /// Get an internal node state by id. + fn get_node(&self, id: NodeId) -> Option> { + let nodes = self.nodes.lock(); + let num = id as usize; + if num < nodes.len() { + Some(nodes[num].clone()) + } else { + None + } + } + + pub fn stop_all(&self) { + self.runtime.lock().crash_all_threads(); + } + + /// Returns a writable end of a TCP connection, to send src->dst messages. + pub fn open_tcp(self: &Arc, dst: NodeId) -> TCP { + // TODO: replace unwrap() with /dev/null socket. + let dst = self.get_node(dst).unwrap(); + let dst_accept = dst.node_events.lock().clone(); + + let rng = self.new_rng(); + self.network_task.start_new_connection(rng, dst_accept) + } + + /// Get current time. 
+ pub fn now(&self) -> u64 { + self.timing.now() + } + + /// Get a copy of the internal clock. + pub fn clock(&self) -> Arc { + self.timing.clone() + } + + pub fn add_event(&self, node: NodeId, data: String) { + let time = self.now(); + self.events.lock().push(SimEvent { time, node, data }); + } + + pub fn take_events(&self) -> Vec { + let mut events = self.events.lock(); + let mut res = Vec::new(); + std::mem::swap(&mut res, &mut events); + res + } + + pub fn deallocate(&self) { + self.stop_all(); + self.timing.clear(); + self.nodes.lock().clear(); + } +} + +/// Internal node state. +pub struct Node { + pub id: NodeId, + node_events: Mutex>, + world: Arc, + pub(crate) rng: Mutex, +} + +impl Node { + pub fn new(id: NodeId, world: Arc, rng: StdRng) -> Node { + Node { + id, + node_events: Mutex::new(Chan::new()), + world, + rng: Mutex::new(rng), + } + } + + /// Spawn a new thread with this node context. + pub fn launch(self: &Arc, f: impl FnOnce(NodeOs) + Send + 'static) -> ExternalHandle { + let node = self.clone(); + let world = self.world.clone(); + self.world.runtime.lock().spawn(move || { + f(NodeOs::new(world, node.clone())); + }) + } + + /// Returns a channel to receive Accepts and internal messages. + pub fn node_events(&self) -> Chan { + self.node_events.lock().clone() + } + + /// This will drop all in-flight Accept messages. + pub fn replug_node_events(&self, chan: Chan) { + *self.node_events.lock() = chan; + } + + /// Append event to the world's log. + pub fn log_event(&self, data: String) { + self.world.add_event(self.id, data) + } +} diff --git a/libs/desim/tests/reliable_copy_test.rs b/libs/desim/tests/reliable_copy_test.rs new file mode 100644 index 0000000000..cf7bff8f5a --- /dev/null +++ b/libs/desim/tests/reliable_copy_test.rs @@ -0,0 +1,244 @@ +//! Simple test to verify that simulator is working. 
+#[cfg(test)] +mod reliable_copy_test { + use anyhow::Result; + use desim::executor::{self, PollSome}; + use desim::options::{Delay, NetworkOptions}; + use desim::proto::{NetEvent, NodeEvent, ReplCell}; + use desim::world::{NodeId, World}; + use desim::{node_os::NodeOs, proto::AnyMessage}; + use parking_lot::Mutex; + use std::sync::Arc; + use tracing::info; + + /// Disk storage trait and implementation. + pub trait Storage { + fn flush_pos(&self) -> u32; + fn flush(&mut self) -> Result<()>; + fn write(&mut self, t: T); + } + + #[derive(Clone)] + pub struct SharedStorage { + pub state: Arc>>, + } + + impl SharedStorage { + pub fn new() -> Self { + Self { + state: Arc::new(Mutex::new(InMemoryStorage::new())), + } + } + } + + impl Storage for SharedStorage { + fn flush_pos(&self) -> u32 { + self.state.lock().flush_pos + } + + fn flush(&mut self) -> Result<()> { + executor::yield_me(0); + self.state.lock().flush() + } + + fn write(&mut self, t: T) { + executor::yield_me(0); + self.state.lock().write(t); + } + } + + pub struct InMemoryStorage { + pub data: Vec, + pub flush_pos: u32, + } + + impl InMemoryStorage { + pub fn new() -> Self { + Self { + data: Vec::new(), + flush_pos: 0, + } + } + + pub fn flush(&mut self) -> Result<()> { + self.flush_pos = self.data.len() as u32; + Ok(()) + } + + pub fn write(&mut self, t: T) { + self.data.push(t); + } + } + + /// Server implementation. 
+ pub fn run_server(os: NodeOs, mut storage: Box>) { + info!("started server"); + + let node_events = os.node_events(); + let mut epoll_vec: Vec> = vec![Box::new(node_events.clone())]; + let mut sockets = vec![]; + + loop { + let index = executor::epoll_chans(&epoll_vec, -1).unwrap(); + + if index == 0 { + let node_event = node_events.must_recv(); + info!("got node event: {:?}", node_event); + if let NodeEvent::Accept(tcp) = node_event { + tcp.send(AnyMessage::Just32(storage.flush_pos())); + epoll_vec.push(Box::new(tcp.recv_chan())); + sockets.push(tcp); + } + continue; + } + + let recv_chan = sockets[index - 1].recv_chan(); + let socket = &sockets[index - 1]; + + let event = recv_chan.must_recv(); + info!("got event: {:?}", event); + if let NetEvent::Message(AnyMessage::ReplCell(cell)) = event { + if cell.seqno != storage.flush_pos() { + info!("got out of order data: {:?}", cell); + continue; + } + storage.write(cell.value); + storage.flush().unwrap(); + socket.send(AnyMessage::Just32(storage.flush_pos())); + } + } + } + + /// Client copies all data from array to the remote node. 
+ pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) { + info!("started client"); + + let mut delivered = 0; + + let mut sock = os.open_tcp(dst); + let mut recv_chan = sock.recv_chan(); + + while delivered < data.len() { + let num = &data[delivered]; + info!("sending data: {:?}", num.clone()); + sock.send(AnyMessage::ReplCell(num.clone())); + + // loop { + let event = recv_chan.recv(); + match event { + NetEvent::Message(AnyMessage::Just32(flush_pos)) => { + if flush_pos == 1 + delivered as u32 { + delivered += 1; + } + } + NetEvent::Closed => { + info!("connection closed, reestablishing"); + sock = os.open_tcp(dst); + recv_chan = sock.recv_chan(); + } + _ => {} + } + + // } + } + + let sock = os.open_tcp(dst); + for num in data { + info!("sending data: {:?}", num.clone()); + sock.send(AnyMessage::ReplCell(num.clone())); + } + + info!("sent all data and finished client"); + } + + /// Run test simulations. + #[test] + fn sim_example_reliable_copy() { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + utils::logging::Output::Stdout, + ) + .expect("logging init failed"); + + let delay = Delay { + min: 1, + max: 60, + fail_prob: 0.4, + }; + + let network = NetworkOptions { + keepalive_timeout: Some(50), + connect_delay: delay.clone(), + send_delay: delay.clone(), + }; + + for seed in 0..20 { + let u32_data: [u32; 5] = [1, 2, 3, 4, 5]; + let data = u32_to_cells(&u32_data, 1); + let world = Arc::new(World::new(seed, Arc::new(network.clone()))); + + start_simulation(Options { + world, + time_limit: 1_000_000, + client_fn: Box::new(move |os, server_id| run_client(os, &data, server_id)), + u32_data, + }); + } + } + + pub struct Options { + pub world: Arc, + pub time_limit: u64, + pub u32_data: [u32; 5], + pub client_fn: Box, + } + + pub fn start_simulation(options: Options) { + let world = options.world; + + let client_node = world.new_node(); + let server_node = world.new_node(); + let server_id 
= server_node.id; + + // start the client thread + client_node.launch(move |os| { + let client_fn = options.client_fn; + client_fn(os, server_id); + }); + + // start the server thread + let shared_storage = SharedStorage::new(); + let server_storage = shared_storage.clone(); + server_node.launch(move |os| run_server(os, Box::new(server_storage))); + + while world.step() && world.now() < options.time_limit {} + + let disk_data = shared_storage.state.lock().data.clone(); + assert!(verify_data(&disk_data, &options.u32_data[..])); + } + + pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec { + let mut res = Vec::new(); + for (i, _) in data.iter().enumerate() { + res.push(ReplCell { + client_id, + seqno: i as u32, + value: data[i], + }); + } + res + } + + fn verify_data(disk_data: &[u32], data: &[u32]) -> bool { + if disk_data.len() != data.len() { + return false; + } + for i in 0..data.len() { + if disk_data[i] != data[i] { + return false; + } + } + true + } +} diff --git a/libs/metrics/Cargo.toml b/libs/metrics/Cargo.toml index d4323ae766..0bd804051c 100644 --- a/libs/metrics/Cargo.toml +++ b/libs/metrics/Cargo.toml @@ -9,5 +9,15 @@ prometheus.workspace = true libc.workspace = true once_cell.workspace = true chrono.workspace = true +twox-hash.workspace = true +measured.workspace = true workspace_hack.workspace = true + +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true +measured-process.workspace = true + +[dev-dependencies] +rand = "0.8" +rand_distr = "0.4.3" diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs new file mode 100644 index 0000000000..f53511ab5c --- /dev/null +++ b/libs/metrics/src/hll.rs @@ -0,0 +1,326 @@ +//! HyperLogLog is an algorithm for the count-distinct problem, +//! approximating the number of distinct elements in a multiset. +//! Calculating the exact cardinality of the distinct elements +//! of a multiset requires an amount of memory proportional to +//! 
the cardinality, which is impractical for very large data sets. +//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm, +//! use significantly less memory than this, but can only approximate the cardinality. + +use std::{ + hash::{BuildHasher, BuildHasherDefault, Hash}, + sync::atomic::AtomicU8, +}; + +use measured::{ + label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, + metric::{ + group::{Encoding, MetricValue}, + name::MetricNameEncoder, + Metric, MetricType, MetricVec, + }, + text::TextEncoder, + LabelGroup, +}; +use twox_hash::xxh3; + +/// Create an [`HyperLogLogVec`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_hll_vec { + ($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{ + let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap(); + $crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec) + }}; + + ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{ + $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES) + }}; +} + +/// Create an [`HyperLogLog`] and registers to default registry. +#[macro_export(local_inner_macros)] +macro_rules! register_hll { + ($N:literal, $OPTS:expr $(,)?) => {{ + let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap(); + $crate::register(Box::new(hll.clone())).map(|_| hll) + }}; + + ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{ + $crate::register_hll!($N, $crate::opts!($NAME, $HELP)) + }}; +} + +/// HLL is a probabilistic cardinality measure. +/// +/// How to use this time-series for a metric name `my_metrics_total_hll`: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...) 
+/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// If you want an estimate over time, you can use the following query: +/// +/// ```promql +/// # harmonic mean +/// 1 / ( +/// sum ( +/// 2 ^ -( +/// # HLL merge operation +/// max ( +/// max_over_time(my_metrics_total_hll{}[$__rate_interval]) +/// ) by (hll_shard, other_labels...) +/// ) +/// ) without (hll_shard) +/// ) +/// * alpha +/// * shards_count +/// * shards_count +/// ``` +/// +/// In the case of low cardinality, you might want to use the linear counting approximation: +/// +/// ```promql +/// # LinearCounting(m, V) = m log (m / V) +/// shards_count * ln(shards_count / +/// # calculate V = how many shards contain a 0 +/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard) +/// ) +/// ``` +/// +/// See for estimates on alpha +pub type HyperLogLogVec = MetricVec, L>; +pub type HyperLogLog = Metric>; + +pub struct HyperLogLogState { + shards: [AtomicU8; N], +} +impl Default for HyperLogLogState { + fn default() -> Self { + #[allow(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU8 = AtomicU8::new(0); + Self { shards: [ZERO; N] } + } +} + +impl MetricType for HyperLogLogState { + type Metadata = (); +} + +impl HyperLogLogState { + pub fn measure(&self, item: &impl Hash) { + // changing the hasher will break compatibility with previous measurements. + self.record(BuildHasherDefault::::default().hash_one(item)); + } + + fn record(&self, hash: u64) { + let p = N.ilog2() as u8; + let j = hash & (N as u64 - 1); + let rho = (hash >> p).leading_zeros() as u8 + 1 - p; + self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed); + } + + fn take_sample(&self) -> [u8; N] { + self.shards.each_ref().map(|x| { + // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus. 
+ + // This seems like it would be a race condition, + // but HLL is not impacted by a write in one shard happening in between. + // This is because in PromQL we will be implementing a harmonic mean of all buckets. + // we will also merge samples in a time series using `max by (hll_shard)`. + + // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window. + // this would mean that a dev port-forwarding the metrics url won't break the sampling. + x.swap(0, std::sync::atomic::Ordering::Relaxed) + }) + } +} +impl measured::metric::MetricEncoding> + for HyperLogLogState +{ + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + enc.write_type(&name, measured::text::MetricType::Gauge) + } + fn collect_into( + &self, + _: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + struct I64(i64); + impl LabelValue for I64 { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0) + } + } + + struct HllShardLabel { + hll_shard: i64, + } + + impl LabelGroup for HllShardLabel { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const LE: &LabelName = LabelName::from_str("hll_shard"); + v.write_value(LE, &I64(self.hll_shard)); + } + } + + self.take_sample() + .into_iter() + .enumerate() + .try_for_each(|(hll_shard, val)| { + enc.write_metric_value( + name.by_ref(), + labels.by_ref().compose_with(HllShardLabel { + hll_shard: hll_shard as i64, + }), + MetricValue::Int(val as i64), + ) + }) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use measured::{label::StaticLabelSet, FixedCardinalityLabel}; + use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand_distr::{Distribution, Zipf}; + + use crate::HyperLogLogVec; + + #[derive(FixedCardinalityLabel, Clone, Copy)] + #[label(singleton = "x")] + enum Label { + A, + B, + } + + fn collect(hll: &HyperLogLogVec, 32>) -> ([u8; 32], [u8; 32]) { + 
// cannot go through the `hll.collect_family_into` interface yet... + // need to see if I can fix the conflicting impls problem in measured. + ( + hll.get_metric(hll.with_labels(Label::A)).take_sample(), + hll.get_metric(hll.with_labels(Label::B)).take_sample(), + ) + } + + fn get_cardinality(samples: &[[u8; 32]]) -> f64 { + let mut buckets = [0.0; 32]; + for &sample in samples { + for (i, m) in sample.into_iter().enumerate() { + buckets[i] = f64::max(buckets[i], m as f64); + } + } + + buckets + .into_iter() + .map(|f| 2.0f64.powf(-f)) + .sum::() + .recip() + * 0.697 + * 32.0 + * 32.0 + } + + fn test_cardinality(n: usize, dist: impl Distribution) -> ([usize; 3], [f64; 3]) { + let hll = HyperLogLogVec::, 32>::new(); + + let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist); + let mut set_a = HashSet::new(); + let mut set_b = HashSet::new(); + + for x in iter.by_ref().take(n) { + set_a.insert(x.to_bits()); + hll.get_metric(hll.with_labels(Label::A)) + .measure(&x.to_bits()); + } + for x in iter.by_ref().take(n) { + set_b.insert(x.to_bits()); + hll.get_metric(hll.with_labels(Label::B)) + .measure(&x.to_bits()); + } + let merge = &set_a | &set_b; + + let (a, b) = collect(&hll); + let len = get_cardinality(&[a, b]); + let len_a = get_cardinality(&[a]); + let len_b = get_cardinality(&[b]); + + ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b]) + } + + #[test] + fn test_cardinality_small() { + let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap()); + + assert_eq!(actual, [46, 30, 32]); + assert!(51.3 < estimate[0] && estimate[0] < 51.4); + assert!(44.0 < estimate[1] && estimate[1] < 44.1); + assert!(39.0 < estimate[2] && estimate[2] < 39.1); + } + + #[test] + fn test_cardinality_medium() { + let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap()); + + assert_eq!(actual, [2529, 1618, 1629]); + assert!(2309.1 < estimate[0] && estimate[0] < 2309.2); + assert!(1566.6 < estimate[1] && estimate[1] < 
1566.7); + assert!(1629.5 < estimate[2] && estimate[2] < 1629.6); + } + + #[test] + fn test_cardinality_large() { + let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap()); + + assert_eq!(actual, [129077, 79579, 79630]); + assert!(126067.2 < estimate[0] && estimate[0] < 126067.3); + assert!(83076.8 < estimate[1] && estimate[1] < 83076.9); + assert!(64251.2 < estimate[2] && estimate[2] < 64251.3); + } + + #[test] + fn test_cardinality_small2() { + let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap()); + + assert_eq!(actual, [92, 58, 60]); + assert!(116.1 < estimate[0] && estimate[0] < 116.2); + assert!(81.7 < estimate[1] && estimate[1] < 81.8); + assert!(69.3 < estimate[2] && estimate[2] < 69.4); + } + + #[test] + fn test_cardinality_medium2() { + let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap()); + + assert_eq!(actual, [8201, 5131, 5051]); + assert!(6846.4 < estimate[0] && estimate[0] < 6846.5); + assert!(5239.1 < estimate[1] && estimate[1] < 5239.2); + assert!(4292.8 < estimate[2] && estimate[2] < 4292.9); + } + + #[test] + fn test_cardinality_large2() { + let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap()); + + assert_eq!(actual, [777847, 482069, 482246]); + assert!(699437.4 < estimate[0] && estimate[0] < 699437.5); + assert!(374948.9 < estimate[1] && estimate[1] < 374949.0); + assert!(434609.7 < estimate[2] && estimate[2] < 434609.8); + } +} diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index d09ba11344..141d8a6d01 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -4,6 +4,17 @@ //! a default registry. 
#![deny(clippy::undocumented_unsafe_blocks)] +use measured::{ + label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels}, + metric::{ + counter::CounterState, + gauge::GaugeState, + group::{Encoding, MetricValue}, + name::{MetricName, MetricNameEncoder}, + MetricEncoding, MetricFamilyEncoding, + }, + FixedCardinalityLabel, LabelGroup, MetricGroup, +}; use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, @@ -11,6 +22,7 @@ use prometheus::core::{ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; +use prometheus::Registry; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; pub use prometheus::{register_counter_vec, Counter, CounterVec}; @@ -23,12 +35,14 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec}; pub use prometheus::{register_int_gauge, IntGauge}; pub use prometheus::{register_int_gauge_vec, IntGaugeVec}; pub use prometheus::{Encoder, TextEncoder}; -use prometheus::{Registry, Result}; pub mod launch_timestamp; mod wrappers; pub use wrappers::{CountedReader, CountedWriter}; -pub mod metric_vec_duration; +mod hll; +pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec}; +#[cfg(target_os = "linux")] +pub mod more_process_metrics; pub type UIntGauge = GenericGauge; pub type UIntGaugeVec = GenericGaugeVec; @@ -56,7 +70,7 @@ static INTERNAL_REGISTRY: Lazy = Lazy::new(Registry::new); /// Register a collector in the internal registry. MUST be called before the first call to `gather()`. /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector /// while holding the lock. 
-pub fn register_internal(c: Box) -> Result<()> { +pub fn register_internal(c: Box) -> prometheus::Result<()> { INTERNAL_REGISTRY.register(c) } @@ -93,6 +107,127 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, ]; +pub struct BuildInfo { + pub revision: &'static str, + pub build_tag: &'static str, +} + +// todo: allow label group without the set +impl LabelGroup for BuildInfo { + fn visit_values(&self, v: &mut impl LabelGroupVisitor) { + const REVISION: &LabelName = LabelName::from_str("revision"); + v.write_value(REVISION, &self.revision); + const BUILD_TAG: &LabelName = LabelName::from_str("build_tag"); + v.write_value(BUILD_TAG, &self.build_tag); + } +} + +impl MetricFamilyEncoding for BuildInfo +where + GaugeState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + enc.write_help(&name, "Build/version information")?; + GaugeState::write_type(&name, enc)?; + GaugeState { + count: std::sync::atomic::AtomicI64::new(1), + } + .collect_into(&(), self, name, enc) + } +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct NeonMetrics { + #[cfg(target_os = "linux")] + #[metric(namespace = "process")] + #[metric(init = measured_process::ProcessCollector::for_self())] + process: measured_process::ProcessCollector, + + #[metric(namespace = "libmetrics")] + #[metric(init = LibMetrics::new(build_info))] + libmetrics: LibMetrics, +} + +#[derive(MetricGroup)] +#[metric(new(build_info: BuildInfo))] +pub struct LibMetrics { + #[metric(init = build_info)] + build_info: BuildInfo, + + #[metric(flatten)] + rusage: Rusage, + + serve_count: CollectionCounter, +} + +fn write_gauge( + x: i64, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Enc, +) -> Result<(), Enc::Err> { + enc.write_metric_value(name, labels, MetricValue::Int(x)) +} + 
+#[derive(Default)] +struct Rusage; + +#[derive(FixedCardinalityLabel, Clone, Copy)] +#[label(singleton = "io_operation")] +enum IoOp { + Read, + Write, +} + +impl MetricGroup for Rusage +where + GaugeState: MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total"); + const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb"); + + let ru = get_rusage_stats(); + + enc.write_help( + DISK_IO, + "Bytes written and read from disk, grouped by the operation (read|write)", + )?; + GaugeState::write_type(DISK_IO, enc)?; + write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?; + write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?; + + enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?; + GaugeState::write_type(MAXRSS, enc)?; + write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?; + + Ok(()) + } +} + +#[derive(Default)] +struct CollectionCounter(CounterState); + +impl MetricFamilyEncoding for CollectionCounter +where + CounterState: MetricEncoding, +{ + fn collect_family_into( + &self, + name: impl measured::metric::name::MetricNameEncoder, + enc: &mut T, + ) -> Result<(), T::Err> { + self.0.inc(); + enc.write_help(&name, "Number of metric requests made")?; + self.0.collect_into(&(), NoLabels, name, enc) + } +} + pub fn set_build_info_metric(revision: &str, build_tag: &str) { let metric = register_int_gauge_vec!( "libmetrics_build_info", @@ -102,6 +237,7 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { .expect("Failed to register build info metric"); metric.with_label_values(&[revision, build_tag]).set(1); } +const BYTES_IN_BLOCK: i64 = 512; // Records I/O stats in a "cross-platform" way. // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats. 
@@ -111,18 +247,25 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) { // performed by the process. // We know the size of the block, so we can determine the I/O bytes out of it. // The value might be not 100% exact, but should be fine for Prometheus metrics in this case. -#[allow(clippy::unnecessary_cast)] fn update_rusage_metrics() { let rusage_stats = get_rusage_stats(); - const BYTES_IN_BLOCK: i64 = 512; DISK_IO_BYTES .with_label_values(&["read"]) .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK); DISK_IO_BYTES .with_label_values(&["write"]) .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK); - MAXRSS_KB.set(rusage_stats.ru_maxrss); + + // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669 + #[cfg(target_os = "macos")] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024); + } + #[cfg(not(target_os = "macos"))] + { + MAXRSS_KB.set(rusage_stats.ru_maxrss); + } } fn get_rusage_stats() -> libc::rusage { @@ -149,6 +292,7 @@ macro_rules! register_int_counter_pair_vec { } }}; } + /// Create an [`IntCounterPair`] and registers to default registry. #[macro_export(local_inner_macros)] macro_rules! register_int_counter_pair { @@ -186,7 +330,10 @@ impl GenericCounterPairVec

{ /// /// An error is returned if the number of label values is not the same as the /// number of VariableLabels in Desc. - pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result> { + pub fn get_metric_with_label_values( + &self, + vals: &[&str], + ) -> prometheus::Result> { Ok(GenericCounterPair { inc: self.inc.get_metric_with_label_values(vals)?, dec: self.dec.get_metric_with_label_values(vals)?, @@ -198,6 +345,11 @@ impl GenericCounterPairVec

{ pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair

{ self.get_metric_with_label_values(vals).unwrap() } + + pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) { + res[0] = self.inc.remove_label_values(vals); + res[1] = self.dec.remove_label_values(vals); + } } impl GenericCounterPair

{ @@ -244,6 +396,15 @@ impl GenericCounterPair

{ } } +impl Clone for GenericCounterPair

{ + fn clone(&self) -> Self { + Self { + inc: self.inc.clone(), + dec: self.dec.clone(), + } + } +} + /// Guard returned by [`GenericCounterPair::guard`] pub struct GenericCounterPairGuard(GenericCounter

); @@ -269,3 +430,180 @@ pub type IntCounterPair = GenericCounterPair; /// A guard for [`IntCounterPair`] that will decrement the gauge on drop pub type IntCounterPairGuard = GenericCounterPairGuard; + +pub trait CounterPairAssoc { + const INC_NAME: &'static MetricName; + const DEC_NAME: &'static MetricName; + + const INC_HELP: &'static str; + const DEC_HELP: &'static str; + + type LabelGroupSet: LabelGroupSet; +} + +pub struct CounterPairVec { + vec: measured::metric::MetricVec, +} + +impl Default for CounterPairVec +where + A::LabelGroupSet: Default, +{ + fn default() -> Self { + Self { + vec: Default::default(), + } + } +} + +impl CounterPairVec { + pub fn guard( + &self, + labels: ::Group<'_>, + ) -> MeasuredCounterPairGuard<'_, A> { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + MeasuredCounterPairGuard { vec: &self.vec, id } + } + pub fn inc(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).inc.inc(); + } + pub fn dec(&self, labels: ::Group<'_>) { + let id = self.vec.with_labels(labels); + self.vec.get_metric(id).dec.inc(); + } + pub fn remove_metric( + &self, + labels: ::Group<'_>, + ) -> Option { + let id = self.vec.with_labels(labels); + self.vec.remove_metric(id) + } + + pub fn sample(&self, labels: ::Group<'_>) -> u64 { + let id = self.vec.with_labels(labels); + let metric = self.vec.get_metric(id); + + let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed); + let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed); + inc.saturating_sub(dec) + } +} + +impl ::measured::metric::group::MetricGroup for CounterPairVec +where + T: ::measured::metric::group::Encoding, + A: CounterPairAssoc, + ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding, +{ + fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> { + // write decrement first to avoid a race condition where inc - dec < 0 + T::write_help(enc, A::DEC_NAME, 
A::DEC_HELP)?; + self.vec + .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?; + + T::write_help(enc, A::INC_NAME, A::INC_HELP)?; + self.vec + .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?; + + Ok(()) + } +} + +#[derive(MetricGroup, Default)] +pub struct MeasuredCounterPairState { + pub inc: CounterState, + pub dec: CounterState, +} + +impl measured::metric::MetricType for MeasuredCounterPairState { + type Metadata = (); +} + +pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> { + vec: &'a measured::metric::MetricVec, + id: measured::metric::LabelId, +} + +impl Drop for MeasuredCounterPairGuard<'_, A> { + fn drop(&mut self) { + self.vec.get_metric(self.id).dec.inc(); + } +} + +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder. +struct Inc(T); +/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder. +struct Dec(T); + +impl Encoding for Inc { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Inc) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Inc, + ) -> Result<(), T::Err> { + self.inc.collect_into(metadata, labels, name, &mut enc.0) + } +} + +impl Encoding for Dec { + type Err = T::Err; + + fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { + self.0.write_help(name, help) + } + + fn 
write_metric_value( + &mut self, + name: impl MetricNameEncoder, + labels: impl LabelGroup, + value: MetricValue, + ) -> Result<(), Self::Err> { + self.0.write_metric_value(name, labels, value) + } +} + +/// Write the dec counter to the encoder +impl MetricEncoding> for MeasuredCounterPairState +where + CounterState: MetricEncoding, +{ + fn write_type(name: impl MetricNameEncoder, enc: &mut Dec) -> Result<(), T::Err> { + CounterState::write_type(name, &mut enc.0) + } + fn collect_into( + &self, + metadata: &(), + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut Dec, + ) -> Result<(), T::Err> { + self.dec.collect_into(metadata, labels, name, &mut enc.0) + } +} diff --git a/libs/metrics/src/metric_vec_duration.rs b/libs/metrics/src/metric_vec_duration.rs deleted file mode 100644 index e9a0a65570..0000000000 --- a/libs/metrics/src/metric_vec_duration.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec`. - -use std::{future::Future, time::Instant}; - -pub trait DurationResultObserver { - fn observe_result(&self, res: &Result, duration: std::time::Duration); -} - -pub async fn observe_async_block_duration_by_result< - T, - E, - F: Future>, - O: DurationResultObserver, ->( - observer: &O, - block: F, -) -> Result { - let start = Instant::now(); - let result = block.await; - let duration = start.elapsed(); - observer.observe_result(&result, duration); - result -} diff --git a/libs/metrics/src/more_process_metrics.rs b/libs/metrics/src/more_process_metrics.rs new file mode 100644 index 0000000000..920724fdec --- /dev/null +++ b/libs/metrics/src/more_process_metrics.rs @@ -0,0 +1,54 @@ +//! process metrics that the [`::prometheus`] crate doesn't provide. + +// This module has heavy inspiration from the prometheus crate's `process_collector.rs`. 
+ +use crate::UIntGauge; + +pub struct Collector { + descs: Vec, + vmlck: crate::UIntGauge, +} + +const NMETRICS: usize = 1; + +impl prometheus::core::Collector for Collector { + fn desc(&self) -> Vec<&prometheus::core::Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let Ok(myself) = procfs::process::Process::myself() else { + return vec![]; + }; + let mut mfs = Vec::with_capacity(NMETRICS); + if let Ok(status) = myself.status() { + if let Some(vmlck) = status.vmlck { + self.vmlck.set(vmlck); + mfs.extend(self.vmlck.collect()) + } + } + mfs + } +} + +impl Collector { + pub fn new() -> Self { + let mut descs = Vec::new(); + + let vmlck = + UIntGauge::new("libmetrics_process_status_vmlck", "/proc/self/status vmlck").unwrap(); + descs.extend( + prometheus::core::Collector::desc(&vmlck) + .into_iter() + .cloned(), + ); + + Self { descs, vmlck } + } +} + +impl Default for Collector { + fn default() -> Self { + Self::new() + } +} diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 4146597d8d..3bba89c76d 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -18,7 +18,11 @@ enum-map.workspace = true strum.workspace = true strum_macros.workspace = true hex.workspace = true +humantime.workspace = true thiserror.workspace = true +humantime-serde.workspace = true +chrono.workspace = true +itertools.workspace = true workspace_hack.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs new file mode 100644 index 0000000000..d996a62349 --- /dev/null +++ b/libs/pageserver_api/src/config.rs @@ -0,0 +1,31 @@ +use std::collections::HashMap; + +use const_format::formatcp; + +#[cfg(test)] +mod tests; + +pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; +pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); +pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; +pub const DEFAULT_HTTP_LISTEN_ADDR: &str = 
formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); + +// Certain metadata (e.g. externally-addressable name, AZ) is delivered +// as a separate structure. This information is not needed by the pageserver +// itself; it is only used for registering the pageserver with the control +// plane and/or storage controller. +// +#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)] +pub struct NodeMetadata { + #[serde(rename = "host")] + pub postgres_host: String, + #[serde(rename = "port")] + pub postgres_port: u16, + pub http_host: String, + pub http_port: u16, + + // Deployment tools may write fields to the metadata file beyond what we + // use in this type: this type intentionally only names the fields that we require. + #[serde(flatten)] + pub other: HashMap, +} diff --git a/libs/pageserver_api/src/config/tests.rs b/libs/pageserver_api/src/config/tests.rs new file mode 100644 index 0000000000..edeefc156e --- /dev/null +++ b/libs/pageserver_api/src/config/tests.rs @@ -0,0 +1,22 @@ +use super::*; + +#[test] +fn test_node_metadata_v1_backward_compatibilty() { + let v1 = serde_json::to_vec(&serde_json::json!({ + "host": "localhost", + "port": 23, + "http_host": "localhost", + "http_port": 42, + })); + + assert_eq!( + serde_json::from_slice::(&v1.unwrap()).unwrap(), + NodeMetadata { + postgres_host: "localhost".to_string(), + postgres_port: 23, + http_host: "localhost".to_string(), + http_port: 42, + other: HashMap::new(), + } + ) +} diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs new file mode 100644 index 0000000000..a0d10dc665 --- /dev/null +++ b/libs/pageserver_api/src/controller_api.rs @@ -0,0 +1,283 @@ +use std::str::FromStr; + +/// Request/response types for the storage controller +/// API (`/control/v1` prefix).
Implemented by the server +/// in [`storage_controller::http`] +use serde::{Deserialize, Serialize}; +use utils::id::{NodeId, TenantId}; + +use crate::{ + models::{ShardParameters, TenantConfig}, + shard::{ShardStripeSize, TenantShardId}, +}; + +#[derive(Serialize, Deserialize)] +pub struct TenantCreateResponseShard { + pub shard_id: TenantShardId, + pub node_id: NodeId, + pub generation: u32, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantCreateResponse { + pub shards: Vec, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeRegisterRequest { + pub node_id: NodeId, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, + + pub listen_http_addr: String, + pub listen_http_port: u16, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeConfigureRequest { + pub node_id: NodeId, + + pub availability: Option, + pub scheduling: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantPolicyRequest { + pub placement: Option, + pub scheduling: Option, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantLocateResponseShard { + pub shard_id: TenantShardId, + pub node_id: NodeId, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, + + pub listen_http_addr: String, + pub listen_http_port: u16, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantLocateResponse { + pub shards: Vec, + pub shard_params: ShardParameters, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponse { + pub tenant_id: TenantId, + pub shards: Vec, + pub stripe_size: ShardStripeSize, + pub policy: PlacementPolicy, + pub config: TenantConfig, +} + +#[derive(Serialize, Deserialize)] +pub struct NodeDescribeResponse { + pub id: NodeId, + + pub availability: NodeAvailabilityWrapper, + pub scheduling: NodeSchedulingPolicy, + + pub listen_http_addr: String, + pub listen_http_port: u16, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantDescribeResponseShard { + pub 
tenant_shard_id: TenantShardId, + + pub node_attached: Option, + pub node_secondary: Vec, + + pub last_error: String, + + /// A task is currently running to reconcile this tenant's intent state with the state on pageservers + pub is_reconciling: bool, + /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending. + pub is_pending_compute_notification: bool, + /// A shard split is currently underway + pub is_splitting: bool, + + pub scheduling_policy: ShardSchedulingPolicy, +} + +/// Explicitly migrating a particular shard is a low level operation +/// TODO: higher level "Reschedule tenant" operation where the request +/// specifies some constraints, e.g. asking it to get off particular node(s) +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantShardMigrateRequest { + pub tenant_shard_id: TenantShardId, + pub node_id: NodeId, +} + +/// Utilisation score indicating how good a candidate a pageserver +/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. +/// Lower values are better. +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)] +pub struct UtilizationScore(pub u64); + +impl UtilizationScore { + pub fn worst() -> Self { + UtilizationScore(u64::MAX) + } +} + +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] +#[serde(into = "NodeAvailabilityWrapper")] +pub enum NodeAvailability { + // Normal, happy state + Active(UtilizationScore), + // Offline: Tenants shouldn't try to attach here, but they may assume that their + // secondary locations on this node still exist. Newly added nodes are in this + // state until we successfully contact them. 
+ Offline, +} + +impl PartialEq for NodeAvailability { + fn eq(&self, other: &Self) -> bool { + use NodeAvailability::*; + matches!((self, other), (Active(_), Active(_)) | (Offline, Offline)) + } +} + +impl Eq for NodeAvailability {} + +// This wrapper provides serde functionality and it should only be used to +// communicate with external callers which don't know or care about the +// utilisation score of the pageserver it is targeting. +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] +pub enum NodeAvailabilityWrapper { + Active, + Offline, +} + +impl From for NodeAvailability { + fn from(val: NodeAvailabilityWrapper) -> Self { + match val { + // Assume the worst utilisation score to begin with. It will later be updated by + // the heartbeats. + NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()), + NodeAvailabilityWrapper::Offline => NodeAvailability::Offline, + } + } +} + +impl From for NodeAvailabilityWrapper { + fn from(val: NodeAvailability) -> Self { + match val { + NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active, + NodeAvailability::Offline => NodeAvailabilityWrapper::Offline, + } + } +} + +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum ShardSchedulingPolicy { + // Normal mode: the tenant's scheduled locations may be updated at will, including + // for non-essential optimization. + Active, + + // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy. + // For example, this still permits a node's attachment location to change to a secondary in + // response to a node failure, or to assign a new secondary if a node was removed. + Essential, + + // No scheduling: leave the shard running wherever it currently is. Even if the shard is + // unavailable, it will not be rescheduled to another node. + Pause, + + // No reconciling: we will make no location_conf API calls to pageservers at all. If the + // shard is unavailable, it stays that way. 
If a node fails, this shard doesn't get failed over. + Stop, +} + +impl Default for ShardSchedulingPolicy { + fn default() -> Self { + Self::Active + } +} + +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] +pub enum NodeSchedulingPolicy { + Active, + Filling, + Pause, + PauseForRestart, + Draining, +} + +impl FromStr for NodeSchedulingPolicy { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self::Active), + "filling" => Ok(Self::Filling), + "pause" => Ok(Self::Pause), + "pause_for_restart" => Ok(Self::PauseForRestart), + "draining" => Ok(Self::Draining), + _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), + } + } +} + +impl From for String { + fn from(value: NodeSchedulingPolicy) -> String { + use NodeSchedulingPolicy::*; + match value { + Active => "active", + Filling => "filling", + Pause => "pause", + PauseForRestart => "pause_for_restart", + Draining => "draining", + } + .to_string() + } +} + +/// Controls how tenant shards are mapped to locations on pageservers, e.g. whether +/// to create secondary locations. +#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)] +pub enum PlacementPolicy { + /// Normal live state: one attached pageserver and zero or more secondaries. + Attached(usize), + /// Create one secondary-mode location. This is useful when onboarding + /// a tenant, or for an idle tenant that we might want to bring online quickly. + Secondary, + + /// Do not attach to any pageservers. This is appropriate for tenants that + /// have been idle for a long time, where we do not mind some delay in making + /// them available in future.
+ Detached, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantShardMigrateResponse {} + +#[cfg(test)] +mod test { + use super::*; + use serde_json; + + /// Check stability of PlacementPolicy's serialization + #[test] + fn placement_policy_encoding() -> anyhow::Result<()> { + let v = PlacementPolicy::Attached(1); + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "{\"Attached\":1}"); + assert_eq!(serde_json::from_str::(&encoded)?, v); + + let v = PlacementPolicy::Detached; + let encoded = serde_json::to_string(&v)?; + assert_eq!(encoded, "\"Detached\""); + assert_eq!(serde_json::from_str::(&encoded)?, v); + Ok(()) + } +} diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 3e1bba2a06..997c1cc43a 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,7 +1,12 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::RepOriginId; +use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; -use std::fmt; +use std::{fmt, ops::Range}; + +use crate::reltag::{BlockNumber, RelTag, SlruKind}; /// Key used in the Repository kv-store. /// @@ -17,15 +22,93 @@ pub struct Key { pub field6: u32, } +/// The storage key size. pub const KEY_SIZE: usize = 18; +/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized. +/// See [`Key::to_i128`] for more information on the encoding. +pub const METADATA_KEY_SIZE: usize = 16; + +/// The first key prefix of the metadata key range. All keys whose first byte is >= 0x60 and < 0x7F are metadata keys. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; +pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; + +/// The (reserved) key prefix of relation sizes. +pub const RELATION_SIZE_PREFIX: u8 = 0x61; + +/// The key prefix of AUX file keys. +pub const AUX_KEY_PREFIX: u8 = 0x62; + +/// The key prefix of ReplOrigin keys.
+pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63; + +/// Check if the key falls in the range of metadata keys. +pub const fn is_metadata_key_slice(key: &[u8]) -> bool { + key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX +} + impl Key { + /// Check if the key falls in the range of metadata keys. + pub const fn is_metadata_key(&self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { + assert!(is_metadata_key_slice(key), "key not in metadata key range"); + // Metadata key space ends at 0x7F so it's fine to directly convert it to i128. + Self::from_i128(i128::from_be_bytes(*key)) + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key(key: &[u8]) -> Self { + Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) + } + + /// Get the range of metadata keys. + pub const fn metadata_key_range() -> Range { + Key { + field1: METADATA_KEY_BEGIN_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: METADATA_KEY_END_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + + /// Get the range of aux keys. + pub fn metadata_aux_key_range() -> Range { + Key { + field1: AUX_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: AUX_KEY_PREFIX + 1, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. 
/// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); - (((self.field1 & 0xf) as i128) << 120) + assert!(self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222); + (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) | ((self.field4 as i128) << 40) @@ -35,7 +118,7 @@ impl Key { pub const fn from_i128(x: i128) -> Self { Key { - field1: ((x >> 120) & 0xf) as u8, + field1: ((x >> 120) & 0x7F) as u8, field2: ((x >> 104) & 0xFFFF) as u32, field3: (x >> 72) as u32, field4: (x >> 40) as u32, @@ -44,11 +127,11 @@ impl Key { } } - pub fn next(&self) -> Key { + pub const fn next(&self) -> Key { self.add(1) } - pub fn add(&self, x: u32) -> Key { + pub const fn add(&self, x: u32) -> Key { let mut key = *self; let r = key.field6.overflowing_add(x); @@ -77,6 +160,8 @@ impl Key { key } + /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::from_i128`] instead if you want to handle 16B keys (i.e., metadata keys). pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -88,6 +173,8 @@ impl Key { } } + /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::to_i128`] instead if you want to get a 16B key (i.e., metadata keys). 
pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -141,8 +228,480 @@ impl Key { } } -pub fn is_rel_block_key(key: &Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff +// Layout of the Key address space +// +// The Key struct, used to address the underlying key-value store, consists of +// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map +// all the data and metadata keys into those 18 bytes. +// +// Principles for the mapping: +// +// - Things that are often accessed or modified together should be close to +// each other in the key space. For example, if a relation is extended by one +// block, we create a new key-value pair for the block data, and update the +// relation size entry. Because of that, the RelSize key comes after all the +// RelBlocks of a relation: the RelSize and the last RelBlock are always next +// to each other. +// +// The key space is divided into four major sections, identified by the first +// byte, and they form a hierarchy: +// +// 00 Relation data and metadata +// +// DbDir () -> (dbnode, spcnode) +// Filenodemap +// RelDir -> relnode forknum +// RelBlocks +// RelSize +// +// 01 SLRUs +// +// SlruDir kind +// SlruSegBlocks segno +// SlruSegSize +// +// 02 pg_twophase +// +// 03 misc +// Controlfile +// checkpoint +// pg_version +// +// 04 aux files +// +// Below is a full list of the keyspace allocation: +// +// DbDir: +// 00 00000000 00000000 00000000 00 00000000 +// +// Filenodemap: +// 00 SPCNODE DBNODE 00000000 00 00000000 +// +// RelDir: +// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) +// +// RelBlock: +// 00 SPCNODE DBNODE RELNODE FORK BLKNUM +// +// RelSize: +// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF +// +// SlruDir: +// 01 kind 00000000 00000000 00 00000000 +// +// SlruSegBlock: +// 01 kind 00000001 SEGNO 00 BLKNUM +// +// SlruSegSize: +// 01 kind 00000001 SEGNO 00
FFFFFFFF +// +// TwoPhaseDir: +// 02 00000000 00000000 00000000 00 00000000 +// +// TwoPhaseFile: +// 02 00000000 00000000 00000000 00 XID +// +// ControlFile: +// 03 00000000 00000000 00000000 00 00000000 +// +// Checkpoint: +// 03 00000000 00000000 00000000 00 00000001 +// +// AuxFiles: +// 03 00000000 00000000 00000000 00 00000002 +// + +//-- Section 01: relation data and metadata + +pub const DBDIR_KEY: Key = Key { + field1: 0x00, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +#[inline(always)] +pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0xffffffff, + field5: 0xff, + field6: 0xffffffff, + } +} + +#[inline(always)] +pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + } +} + +#[inline(always)] +pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 1, + } +} + +#[inline(always)] +pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: blknum, + } +} + +#[inline(always)] +pub fn rel_size_to_key(rel: RelTag) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0xffff_ffff, + } +} + +impl Key { + #[inline(always)] + pub fn is_rel_size_key(&self) -> bool { + self.field1 == 0 && self.field6 == u32::MAX + } +} + +#[inline(always)] +pub fn rel_key_range(rel: RelTag) -> Range { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0, + }..Key { + field1: 0x00, + field2: 
rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum + 1, + field6: 0, + } +} + +//-- Section 02: SLRUs + +#[inline(always)] +pub fn slru_dir_to_key(kind: SlruKind) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } +} + +#[inline(always)] +pub fn slru_dir_kind(key: &Key) -> Option> { + if key.field1 == 0x01 + && key.field3 == 0 + && key.field4 == 0 + && key.field5 == 0 + && key.field6 == 0 + { + match key.field2 { + 0 => Some(Ok(SlruKind::Clog)), + 1 => Some(Ok(SlruKind::MultiXactMembers)), + 2 => Some(Ok(SlruKind::MultiXactOffsets)), + x => Some(Err(x)), + } + } else { + None + } +} + +#[inline(always)] +pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: blknum, + } +} + +#[inline(always)] +pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: 0xffff_ffff, + } +} + +impl Key { + pub fn is_slru_segment_size_key(&self) -> bool { + self.field1 == 0x01 + && self.field2 < 0x03 + && self.field3 == 0x01 + && self.field5 == 0 + && self.field6 == u32::MAX + } +} + +#[inline(always)] +pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { + let field2 = match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }; + + Key { + field1: 0x01, + field2, + field3: 1, + field4: segno, + field5: 0, + field6: 0, + }..Key { + field1: 0x01, + field2, + field3: 1, + 
field4: segno, + field5: 1, + field6: 0, + } +} + +//-- Section 03: pg_twophase + +pub const TWOPHASEDIR_KEY: Key = Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +#[inline(always)] +pub fn twophase_file_key(xid: TransactionId) -> Key { + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + } +} + +#[inline(always)] +pub fn twophase_key_range(xid: TransactionId) -> Range { + let (next_xid, overflowed) = xid.overflowing_add(1); + + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + }..Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: u8::from(overflowed), + field6: next_xid, + } +} + +//-- Section 03: Control file +pub const CONTROLFILE_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +pub const CHECKPOINT_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 1, +}; + +pub const AUX_FILES_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 2, +}; + +#[inline(always)] +pub fn repl_origin_key(origin_id: RepOriginId) -> Key { + Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: origin_id as u32, + } +} + +/// Get the range of replorigin keys. +pub fn repl_origin_key_range() -> Range { + Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: REPL_ORIGIN_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0x10000, + } +} + +// Reverse mappings for a few Keys. +// These are needed by WAL redo manager. + +/// Non inherited range for vectored get. +pub const NON_INHERITED_RANGE: Range = AUX_FILES_KEY..AUX_FILES_KEY.next(); +/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range. 
+pub const NON_INHERITED_SPARSE_RANGE: Range = Key::metadata_key_range(); + +impl Key { + // AUX_FILES currently stores only data for logical replication (slots etc), and + // we don't preserve these on a branch because safekeepers can't follow timeline + // switch (and generally it likely should be optional), so ignore these. + #[inline(always)] + pub fn is_inherited_key(self) -> bool { + !NON_INHERITED_RANGE.contains(&self) && !NON_INHERITED_SPARSE_RANGE.contains(&self) + } + + #[inline(always)] + pub fn is_rel_fsm_block_key(self) -> bool { + self.field1 == 0x00 + && self.field4 != 0 + && self.field5 == FSM_FORKNUM + && self.field6 != 0xffffffff + } + + #[inline(always)] + pub fn is_rel_vm_block_key(self) -> bool { + self.field1 == 0x00 + && self.field4 != 0 + && self.field5 == VISIBILITYMAP_FORKNUM + && self.field6 != 0xffffffff + } + + #[inline(always)] + pub fn to_slru_block(self) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { + Ok(match self.field1 { + 0x01 => { + let kind = match self.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", self.field2), + }; + let segno = self.field4; + let blknum = self.field6; + + (kind, segno, blknum) + } + _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + }) + } + + #[inline(always)] + pub fn is_slru_block_key(self) -> bool { + self.field1 == 0x01 // SLRU-related + && self.field3 == 0x00000001 // but not SlruDir + && self.field6 != 0xffffffff // and not SlruSegSize + } + + #[inline(always)] + pub fn is_rel_block_key(&self) -> bool { + self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff + } + + /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. 
+ #[inline(always)] + pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { + Ok(match self.field1 { + 0x00 => ( + RelTag { + spcnode: self.field2, + dbnode: self.field3, + relnode: self.field4, + forknum: self.field5, + }, + self.field6, + ), + _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + }) + } } impl std::str::FromStr for Key { @@ -157,11 +716,14 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; + use crate::key::is_metadata_key_slice; use crate::key::Key; use rand::Rng; use rand::SeedableRng; + use super::AUX_KEY_PREFIX; + #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); @@ -177,4 +739,21 @@ mod tests { assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); } + + #[test] + fn test_metadata_keys() { + let mut metadata_key = vec![AUX_KEY_PREFIX]; + metadata_key.extend_from_slice(&[0xFF; 15]); + let encoded_key = Key::from_metadata_key(&metadata_key); + let output_key = encoded_key.to_i128().to_be_bytes(); + assert_eq!(metadata_key, output_key); + assert!(encoded_key.is_metadata_key()); + assert!(is_metadata_key_slice(&metadata_key)); + } + + #[test] + fn test_possible_largest_key() { + Key::from_i128(0x7FFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF_FFFF); + // TODO: put this key into the system and see if anything breaks. + } } diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 80183506d8..9a61f2ad81 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -1,7 +1,11 @@ use postgres_ffi::BLCKSZ; use std::ops::Range; -use crate::key::Key; +use crate::{ + key::Key, + shard::{ShardCount, ShardIdentity}, +}; +use itertools::Itertools; /// /// Represents a set of Keys, in a compact form. @@ -13,44 +17,279 @@ pub struct KeySpace { pub ranges: Vec>, } -impl KeySpace { +/// A wrapper type for sparse keyspaces. 
+#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct SparseKeySpace(pub KeySpace); + +/// Represents a contiguous half-open range of the keyspace, masked according to a particular +/// ShardNumber's stripes: within this range of keys, only some "belong" to the current +/// shard. +/// +/// When we iterate over keys within this object, we will skip any keys that don't belong +/// to this shard. +/// +/// The start + end keys may not belong to the shard: these specify where layer files should +/// start + end, but we will never actually read/write those keys. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShardedRange<'a> { + pub shard_identity: &'a ShardIdentity, + pub range: Range, +} + +// Calculate the size of a range within the blocks of the same relation, or spanning only the +// top page in the previous relation's space. +fn contiguous_range_len(range: &Range) -> u32 { + debug_assert!(is_contiguous_range(range)); + if range.start.field6 == 0xffffffff { + range.end.field6 + 1 + } else { + range.end.field6 - range.start.field6 + } +} + +/// Return true if this key range includes only keys in the same relation's data blocks, or +/// just spanning one relation and the logical size (0xffffffff) block of the relation before it. +/// +/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not +/// be on our shard. Later in ShardedRange we do the extra work to figure out how much +/// of a given contiguous range is present on one shard. +/// +/// This matters, because: +/// - Within such ranges, keys are used contiguously. Outside such ranges it is sparse. +/// - Within such ranges, we may calculate distances using simple subtraction of field6. 
+fn is_contiguous_range(range: &Range) -> bool { + range.start.field1 == range.end.field1 + && range.start.field2 == range.end.field2 + && range.start.field3 == range.end.field3 + && range.start.field4 == range.end.field4 + && (range.start.field5 == range.end.field5 + || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5)) +} + +impl<'a> ShardedRange<'a> { + pub fn new(range: Range, shard_identity: &'a ShardIdentity) -> Self { + Self { + shard_identity, + range, + } + } + + /// Break up this range into chunks, each of which has at least one local key in it if the + /// total range has at least one local key. + pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range)> { + // Optimization for single-key case (e.g. logical size keys) + if self.range.end == self.range.start.add(1) { + return vec![( + if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }, + self.range, + )]; + } + + if !is_contiguous_range(&self.range) { + // Ranges that span relations are not fragmented. We only get these ranges as a result + // of operations that act on existing layers, so we trust that the existing range is + // reasonably small. + return vec![(u32::MAX, self.range)]; + } + + let mut fragments: Vec<(u32, Range)> = Vec::new(); + + let mut cursor = self.range.start; + while cursor < self.range.end { + let advance_by = self.distance_to_next_boundary(cursor); + let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor); + + // If the previous fragment is undersized, then we seek to consume enough + // blocks to complete it. + let (want_blocks, merge_last_fragment) = match fragments.last_mut() { + Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)), + Some(frag) => { + // Prev block is complete, want the full number. 
+ ( + target_nblocks, + if is_fragment_disposable { + // If this current range will be empty (not shard-local data), we will merge into previous + Some(frag) + } else { + None + }, + ) + } + None => { + // First iteration, want the full number + (target_nblocks, None) + } + }; + + let advance_by = if is_fragment_disposable { + advance_by + } else { + std::cmp::min(advance_by, want_blocks) + }; + + let next_cursor = cursor.add(advance_by); + + let this_frag = ( + if is_fragment_disposable { + 0 + } else { + advance_by + }, + cursor..next_cursor, + ); + cursor = next_cursor; + + if let Some(last_fragment) = merge_last_fragment { + // Previous fragment was short or this one is empty, merge into it + last_fragment.0 += this_frag.0; + last_fragment.1.end = this_frag.1.end; + } else { + fragments.push(this_frag); + } + } + + fragments + } + + /// Estimate the physical pages that are within this range, on this shard. This returns + /// u32::MAX if the range spans relations: this return value should be interpreted as "large". + pub fn page_count(&self) -> u32 { + // Special cases for single keys like logical sizes + if self.range.end == self.range.start.add(1) { + return if self.shard_identity.is_key_disposable(&self.range.start) { + 0 + } else { + 1 + }; + } + + // We can only do an authentic calculation of contiguous key ranges + if !is_contiguous_range(&self.range) { + return u32::MAX; + } + + // Special case for single sharded tenants: our logical and physical sizes are the same + if self.shard_identity.count < ShardCount::new(2) { + return contiguous_range_len(&self.range); + } + + // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs + // to Self, and add the stripe's block count to our total if so. 
+ let mut result: u64 = 0; + let mut cursor = self.range.start; + while cursor < self.range.end { + // Count up to the next stripe_size boundary or end of range + let advance_by = self.distance_to_next_boundary(cursor); + + // If this blocks in this stripe belong to us, add them to our count + if !self.shard_identity.is_key_disposable(&cursor) { + result += advance_by as u64; + } + + cursor = cursor.add(advance_by); + } + + if result > u32::MAX as u64 { + u32::MAX + } else { + result as u32 + } + } + + /// Advance the cursor to the next potential fragment boundary: this is either + /// a stripe boundary, or the end of the range. + fn distance_to_next_boundary(&self, cursor: Key) -> u32 { + let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end)); + + if self.shard_identity.count < ShardCount::new(2) { + // Optimization: don't bother stepping through stripes if the tenant isn't sharded. + return distance_to_range_end; + } + + if cursor.field6 == 0xffffffff { + // We are wrapping from one relation's logical size to the next relation's first data block + return 1; + } + + let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0; + let stripe_remainder = self.shard_identity.stripe_size.0 + - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0); + + if cfg!(debug_assertions) { + // We should never overflow field5 and field6 -- our callers check this earlier + // and would have returned their u32::MAX cases if the input range violated this. 
+ let next_cursor = cursor.add(stripe_remainder); + debug_assert!( + next_cursor.field1 == cursor.field1 + && next_cursor.field2 == cursor.field2 + && next_cursor.field3 == cursor.field3 + && next_cursor.field4 == cursor.field4 + && next_cursor.field5 == cursor.field5 + ) + } + + std::cmp::min(stripe_remainder, distance_to_range_end) + } + + /// Whereas `page_count` estimates the number of pages physically in this range on this shard, + /// this function simply calculates the number of pages in the space, without accounting for those + /// pages that would not actually be stored on this node. /// + /// Don't use this function in code that works with physical entities like layer files. + pub fn raw_size(range: &Range) -> u32 { + if is_contiguous_range(range) { + contiguous_range_len(range) + } else { + u32::MAX + } + } +} + +impl KeySpace { + /// Create a key space with a single range. + pub fn single(key_range: Range) -> Self { + Self { + ranges: vec![key_range], + } + } + /// Partition a key space into roughly chunks of roughly 'target_size' bytes /// in each partition. /// - pub fn partition(&self, target_size: u64) -> KeyPartitioning { + pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning { // Assume that each value is 8k in size. - let target_nblocks = (target_size / BLCKSZ as u64) as usize; + let target_nblocks = (target_size / BLCKSZ as u64) as u32; let mut parts = Vec::new(); let mut current_part = Vec::new(); let mut current_part_size: usize = 0; for range in &self.ranges { - // If appending the next contiguous range in the keyspace to the current - // partition would cause it to be too large, start a new partition. 
- let this_size = key_range_size(range) as usize; - if current_part_size + this_size > target_nblocks && !current_part.is_empty() { - parts.push(KeySpace { - ranges: current_part, - }); - current_part = Vec::new(); - current_part_size = 0; - } + // While doing partitioning, wrap the range in ShardedRange so that our size calculations + // will respect shard striping rather than assuming all keys within a range are present. + let range = ShardedRange::new(range.clone(), shard_identity); - // If the next range is larger than 'target_size', split it into - // 'target_size' chunks. - let mut remain_size = this_size; - let mut start = range.start; - while remain_size > target_nblocks { - let next = start.add(target_nblocks as u32); - parts.push(KeySpace { - ranges: vec![start..next], - }); - start = next; - remain_size -= target_nblocks + // Chunk up the range into parts that each contain up to target_size local blocks + for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, and our current partition + // covers at least one block that is physically present in this shard, + // then start a new partition + if current_part_size + frag_on_shard_size as usize > target_nblocks as usize + && current_part_size > 0 + { + parts.push(KeySpace { + ranges: current_part, + }); + current_part = Vec::new(); + current_part_size = 0; + } + current_part.push(frag_range.start..frag_range.end); + current_part_size += frag_on_shard_size as usize; } - current_part.push(start..range.end); - current_part_size += remain_size; } // add last partition that wasn't full yet. @@ -63,16 +302,128 @@ impl KeySpace { KeyPartitioning { parts } } + pub fn is_empty(&self) -> bool { + self.total_raw_size() == 0 + } + + /// Merge another keyspace into the current one. + /// Note: the keyspaces must not overlap (enforced via assertions). 
To merge overlapping key ranges, use `KeySpaceRandomAccum`. + pub fn merge(&mut self, other: &KeySpace) { + let all_ranges = self + .ranges + .iter() + .merge_by(other.ranges.iter(), |lhs, rhs| lhs.start < rhs.start); + + let mut accum = KeySpaceAccum::new(); + let mut prev: Option<&Range> = None; + for range in all_ranges { + if let Some(prev) = prev { + let overlap = + std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end); + assert!( + !overlap, + "Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}", + prev, range + ); + } + + accum.add_range(range.clone()); + prev = Some(range); + } + + self.ranges = accum.to_keyspace().ranges; + } + + /// Remove all keys in `other` from `self`. + /// This can involve splitting or removing of existing ranges. + /// Returns the removed keyspace + pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace { + let (self_start, self_end) = match (self.start(), self.end()) { + (Some(start), Some(end)) => (start, end), + _ => { + // self is empty + return KeySpace::default(); + } + }; + + // Key spaces are sorted by definition, so skip ahead to the first + // potentially intersecting range. Similarly, ignore ranges that start + // after the current keyspace ends. + let other_ranges = other + .ranges + .iter() + .skip_while(|range| self_start >= range.end) + .take_while(|range| self_end > range.start); + + let mut removed_accum = KeySpaceRandomAccum::new(); + for range in other_ranges { + while let Some(overlap_at) = self.overlaps_at(range) { + let overlapped = self.ranges[overlap_at].clone(); + + if overlapped.start < range.start && overlapped.end <= range.end { + // Higher part of the range is completely overlapped. + removed_accum.add_range(range.start..self.ranges[overlap_at].end); + self.ranges[overlap_at].end = range.start; + } + if overlapped.start >= range.start && overlapped.end > range.end { + // Lower part of the range is completely overlapped. 
+ removed_accum.add_range(self.ranges[overlap_at].start..range.end); + self.ranges[overlap_at].start = range.end; + } + if overlapped.start < range.start && overlapped.end > range.end { + // Middle part of the range is overlapped. + removed_accum.add_range(range.clone()); + self.ranges[overlap_at].end = range.start; + self.ranges + .insert(overlap_at + 1, range.end..overlapped.end); + } + if overlapped.start >= range.start && overlapped.end <= range.end { + // Whole range is overlapped + removed_accum.add_range(self.ranges[overlap_at].clone()); + self.ranges.remove(overlap_at); + } + } + } + + removed_accum.to_keyspace() + } + + pub fn start(&self) -> Option { + self.ranges.first().map(|range| range.start) + } + + pub fn end(&self) -> Option { + self.ranges.last().map(|range| range.end) + } + + /// The size of the keyspace in pages, before accounting for sharding + pub fn total_raw_size(&self) -> usize { + self.ranges + .iter() + .map(|range| ShardedRange::raw_size(range) as usize) + .sum() + } + + fn overlaps_at(&self, range: &Range) -> Option { + match self.ranges.binary_search_by_key(&range.end, |r| r.start) { + Ok(0) => None, + Err(0) => None, + Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1), + Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1), + _ => None, + } + } + /// /// Check if key space contains overlapping range /// pub fn overlaps(&self, range: &Range) -> bool { - match self.ranges.binary_search_by_key(&range.end, |r| r.start) { - Ok(0) => false, - Err(0) => false, - Ok(index) => self.ranges[index - 1].end > range.start, - Err(index) => self.ranges[index - 1].end > range.start, - } + self.overlaps_at(range).is_some() + } + + /// Check if the keyspace contains a key + pub fn contains(&self, key: &Key) -> bool { + self.overlaps(&(*key..key.next())) } } @@ -88,10 +439,33 @@ pub struct KeyPartitioning { pub parts: Vec, } +/// Represents a partitioning of the sparse key space. 
+#[derive(Clone, Debug, Default)] +pub struct SparseKeyPartitioning { + pub parts: Vec, +} + impl KeyPartitioning { pub fn new() -> Self { KeyPartitioning { parts: Vec::new() } } + + /// Convert a key partitioning to a sparse partition. + pub fn into_sparse(self) -> SparseKeyPartitioning { + SparseKeyPartitioning { + parts: self.parts.into_iter().map(SparseKeySpace).collect(), + } + } +} + +impl SparseKeyPartitioning { + /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will + /// cause long/dead loops. + pub fn into_dense(self) -> KeyPartitioning { + KeyPartitioning { + parts: self.parts.into_iter().map(|x| x.0).collect(), + } + } } /// @@ -104,6 +478,7 @@ pub struct KeySpaceAccum { accum: Option>, ranges: Vec>, + size: u64, } impl KeySpaceAccum { @@ -111,14 +486,19 @@ impl KeySpaceAccum { Self { accum: None, ranges: Vec::new(), + size: 0, } } + #[inline(always)] pub fn add_key(&mut self, key: Key) { self.add_range(singleton_range(key)) } + #[inline(always)] pub fn add_range(&mut self, range: Range) { + self.size += ShardedRange::raw_size(&range) as u64; + match self.accum.as_mut() { Some(accum) => { if range.start == accum.end { @@ -144,6 +524,16 @@ impl KeySpaceAccum { ranges: self.ranges, } } + + pub fn consume_keyspace(&mut self) -> KeySpace { + std::mem::take(self).to_keyspace() + } + + // The total number of keys in this object, ignoring any sharding effects that might cause some of + // the keys to be omitted in storage on this shard. 
+ pub fn raw_size(&self) -> u64 { + self.size + } } /// @@ -168,6 +558,12 @@ impl KeySpaceRandomAccum { self.ranges.push(range); } + pub fn add_keyspace(&mut self, keyspace: KeySpace) { + for range in keyspace.ranges { + self.add_range(range); + } + } + pub fn to_keyspace(mut self) -> KeySpace { let mut ranges = Vec::new(); if !self.ranges.is_empty() { @@ -188,28 +584,12 @@ impl KeySpaceRandomAccum { } KeySpace { ranges } } -} -pub fn key_range_size(key_range: &Range) -> u32 { - let start = key_range.start; - let end = key_range.end; + pub fn consume_keyspace(&mut self) -> KeySpace { + let mut prev_accum = KeySpaceRandomAccum::new(); + std::mem::swap(self, &mut prev_accum); - if end.field1 != start.field1 - || end.field2 != start.field2 - || end.field3 != start.field3 - || end.field4 != start.field4 - { - return u32::MAX; - } - - let start = (start.field5 as u64) << 32 | start.field6 as u64; - let end = (end.field5 as u64) << 32 | end.field6 as u64; - - let diff = end - start; - if diff > u32::MAX as u64 { - u32::MAX - } else { - diff as u32 + prev_accum.to_keyspace() } } @@ -219,6 +599,13 @@ pub fn singleton_range(key: Key) -> Range { #[cfg(test)] mod tests { + use rand::{RngCore, SeedableRng}; + + use crate::{ + models::ShardParameters, + shard::{ShardCount, ShardNumber}, + }; + use super::*; use std::fmt::Write; @@ -252,6 +639,33 @@ mod tests { } } + #[test] + fn keyspace_consume() { + let ranges = vec![kr(0..10), kr(20..35), kr(40..45)]; + + let mut accum = KeySpaceAccum::new(); + for range in &ranges { + accum.add_range(range.clone()); + } + + let expected_size: u64 = ranges + .iter() + .map(|r| ShardedRange::raw_size(r) as u64) + .sum(); + assert_eq!(accum.raw_size(), expected_size); + + assert_ks_eq(&accum.consume_keyspace(), ranges.clone()); + assert_eq!(accum.raw_size(), 0); + + assert_ks_eq(&accum.consume_keyspace(), vec![]); + assert_eq!(accum.raw_size(), 0); + + for range in &ranges { + accum.add_range(range.clone()); + } + 
assert_ks_eq(&accum.to_keyspace(), ranges); + } + #[test] fn keyspace_add_range() { // two separate ranges @@ -394,4 +808,559 @@ mod tests { // xxxxxxxxxxx assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently! } + + #[test] + fn test_remove_full_overlapps() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(4), + Key::from_i128(5)..Key::from_i128(8), + Key::from_i128(10)..Key::from_i128(12), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(13), + ], + }; + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(2)..Key::from_i128(3), + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + ], + }; + assert_eq!(removed, removed_expected); + + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(2), + Key::from_i128(3)..Key::from_i128(4), + Key::from_i128(5)..Key::from_i128(6), + Key::from_i128(7)..Key::from_i128(8), + Key::from_i128(10)..Key::from_i128(11) + ] + ); + } + + #[test] + fn test_remove_partial_overlaps() { + // Test partial ovelaps + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(8)..Key::from_i128(11), + Key::from_i128(14)..Key::from_i128(17), + ], + }; + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(3)..Key::from_i128(5), + Key::from_i128(8)..Key::from_i128(10), + Key::from_i128(14)..Key::from_i128(15), + ], + }; + assert_eq!(removed, removed_expected); + + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(3), + 
Key::from_i128(7)..Key::from_i128(8), + Key::from_i128(12)..Key::from_i128(14), + ] + ); + } + + #[test] + fn test_remove_no_overlaps() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ], + }; + let key_space2 = KeySpace { + ranges: vec![ + Key::from_i128(6)..Key::from_i128(7), + Key::from_i128(11)..Key::from_i128(12), + Key::from_i128(15)..Key::from_i128(17), + ], + }; + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace::default(); + assert_eq!(removed, removed_expected); + + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(5), + Key::from_i128(7)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + ] + ); + } + + #[test] + fn test_remove_one_range_overlaps_multiple() { + let mut key_space1 = KeySpace { + ranges: vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(6)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(20), + Key::from_i128(20)..Key::from_i128(30), + Key::from_i128(30)..Key::from_i128(40), + ], + }; + let key_space2 = KeySpace { + ranges: vec![Key::from_i128(9)..Key::from_i128(19)], + }; + + let removed = key_space1.remove_overlapping_with(&key_space2); + let removed_expected = KeySpace { + ranges: vec![ + Key::from_i128(9)..Key::from_i128(10), + Key::from_i128(12)..Key::from_i128(15), + Key::from_i128(17)..Key::from_i128(19), + ], + }; + assert_eq!(removed, removed_expected); + + assert_eq!( + key_space1.ranges, + vec![ + Key::from_i128(1)..Key::from_i128(3), + Key::from_i128(3)..Key::from_i128(6), + Key::from_i128(6)..Key::from_i128(9), + Key::from_i128(19)..Key::from_i128(20), + Key::from_i128(20)..Key::from_i128(30), + Key::from_i128(30)..Key::from_i128(40), + ] + ); + } + #[test] + fn sharded_range_relation_gap() { + let 
shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067F00000005000040100300000000").unwrap(), + end: Key::from_hex("000000067F00000005000040130000004000").unwrap(), + }, + &shard_identity, + ); + + // Key range spans relations, expect MAX + assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_single_key() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(), + end: Key::from_hex("000000067f00000001000000700100000000").unwrap(), + }, + &shard_identity, + ); + // Single-key range on logical size key + assert_eq!(range.page_count(), 1); + } + + /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation + #[test] + fn contiguous_range_check() { + assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000003").unwrap()) + ),); + + // The ranges goes all the way up to the 0xffffffff, including it: this is + // not considered a rel block range because 0xffffffff stores logical sizes, + // not blocks. 
+ assert!(!is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000000").unwrap()) + ),); + + // Keys within the normal data region of a relation + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df0000000000").unwrap() + ..Key::from_hex("000000067f00000001000004df0000000080").unwrap()) + ),); + + // The logical size key of one forkno, then some blocks in the next + assert!(is_contiguous_range( + &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap() + ..Key::from_hex("000000067f00000001000004df0100000080").unwrap()) + ),); + } + + #[test] + fn shard_identity_keyspaces_forkno_gap() { + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(), + end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(), + }, + &shard_identity, + ); + + // Range spanning the end of one forkno and the start of the next: we do not attempt to + // calculate a valid size, because we have no way to know if they keys between start + // and end are actually in use. 
+ assert_eq!(range.page_count(), u32::MAX); + } + + #[test] + fn shard_identity_keyspaces_one_relation() { + for shard_number in 0..4 { + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + let range = ShardedRange::new( + Range { + start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(), + end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(), + }, + &shard_identity, + ); + + // Very simple case: range covering block zero of one relation, where that block maps to shard zero + if shard_number == 0 { + assert_eq!(range.page_count(), 1); + } else { + // Other shards should perceive the range's size as zero + assert_eq!(range.page_count(), 0); + } + } + } + + /// Test helper: construct a ShardedRange and call fragment() on it, returning + /// the total page count in the range and the fragments. + fn do_fragment( + range_start: Key, + range_end: Key, + shard_identity: &ShardIdentity, + target_nblocks: u32, + ) -> (u32, Vec<(u32, Range)>) { + let range = ShardedRange::new( + Range { + start: range_start, + end: range_end, + }, + shard_identity, + ); + + let page_count = range.page_count(); + let fragments = range.fragment(target_nblocks); + + // Invariant: we always get at least one fragment + assert!(!fragments.is_empty()); + + // Invariant: the first/last fragment start/end should equal the input start/end + assert_eq!(fragments.first().unwrap().1.start, range_start); + assert_eq!(fragments.last().unwrap().1.end, range_end); + + if page_count > 0 { + // Invariant: every fragment must contain at least one shard-local page, if the + // total range contains at least one shard-local page + let all_nonzero = fragments.iter().all(|f| f.0 > 0); + if !all_nonzero { + eprintln!("Found a zero-length fragment: {:?}", fragments); + } + assert!(all_nonzero); + } else { + // A range with no shard-local pages should always be returned as a single fragment 
+ assert_eq!(fragments, vec![(0, range_start..range_end)]); + } + + // Invariant: fragments must be ordered and non-overlapping + let mut last: Option> = None; + for frag in &fragments { + if let Some(last) = last { + assert!(frag.1.start >= last.end); + assert!(frag.1.start > last.start); + } + last = Some(frag.1.clone()) + } + + // Invariant: fragments respect target_nblocks + for frag in &fragments { + assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks); + } + + (page_count, fragments) + } + + /// Really simple tests for fragment(), on a range that just contains a single stripe + /// for a single tenant. + #[test] + fn sharded_range_fragment_simple() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which we happen to know covers exactly one stripe which belongs to this shard + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + + // Ask for stripe_size blocks, we get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 32768), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for more, we still get the whole stripe + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 10000000), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for target_nblocks of half the stripe size, we get two halves + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16384), + ( + 32768, + vec![ + (16384, input_start..input_start.add(16384)), + (16384, input_start.add(16384)..input_end) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_multi_stripe() { + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + + // A range which covers multiple stripes, exactly one of which belongs to 
the current shard. + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + // Ask for all the blocks, get a fragment that covers the whole range but reports + // its size to be just the blocks belonging to our shard. + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 131072), + (32768, vec![(32768, input_start..input_end)]) + ); + + // Ask for a sub-stripe quantity + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16000), + ( + 32768, + vec![ + (16000, input_start..input_start.add(16000)), + (16000, input_start.add(16000)..input_start.add(32000)), + (768, input_start.add(32000)..input_end), + ] + ) + ); + + // Try on a range that starts slightly after our owned stripe + assert_eq!( + do_fragment(input_start.add(1), input_end, &shard_identity, 131072), + (32767, vec![(32767, input_start.add(1)..input_end)]) + ); + } + + /// Test our calculations work correctly when we start a range from the logical size key of + /// a previous relation. 
+ #[test] + fn sharded_range_fragment_starting_from_logical_size() { + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + + // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x8001, vec![(0x8001, input_start..input_end)]) + ); + + // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards + // store all logical sizes) + let shard_identity = ShardIdentity::new( + ShardNumber(1), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x10000), + (0x1, vec![(0x1, input_start..input_end)]) + ); + } + + /// Test that ShardedRange behaves properly when used on un-sharded data + #[test] + fn sharded_range_fragment_unsharded() { + let shard_identity = ShardIdentity::unsharded(); + + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + ( + 0x10000, + vec![ + (0x8000, input_start..input_start.add(0x8000)), + (0x8000, input_start.add(0x8000)..input_start.add(0x10000)) + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_cross_relation() { + let shard_identity = ShardIdentity::unsharded(); + + // A range that spans relations: expect fragmentation to give up and return a u32::MAX size + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap(); + assert_eq!( + 
do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + + // Same, but using a sharded identity + let shard_identity = ShardIdentity::new( + ShardNumber(0), + ShardCount::new(4), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 0x8000), + (u32::MAX, vec![(u32::MAX, input_start..input_end),]) + ); + } + + #[test] + fn sharded_range_fragment_tiny_nblocks() { + let shard_identity = ShardIdentity::unsharded(); + + // A small range (0x38 keys) fragmented with a tiny target size of 16: expect three full fragments plus a remainder of 8 + let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap(); + let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap(); + assert_eq!( + do_fragment(input_start, input_end, &shard_identity, 16), + ( + 0x38, + vec![ + (16, input_start..input_start.add(16)), + (16, input_start.add(16)..input_start.add(32)), + (16, input_start.add(32)..input_start.add(48)), + (8, input_start.add(48)..input_end), + ] + ) + ); + } + + #[test] + fn sharded_range_fragment_fuzz() { + // Use a fixed seed: we don't want to explicitly pick values, but we do want + // the test to be reproducible.
+ let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef); + + for _i in 0..1000 { + let shard_identity = if prng.next_u32() % 2 == 0 { + ShardIdentity::unsharded() + } else { + let shard_count = prng.next_u32() % 127 + 1; + ShardIdentity::new( + ShardNumber((prng.next_u32() % shard_count) as u8), + ShardCount::new(shard_count as u8), + ShardParameters::DEFAULT_STRIPE_SIZE, + ) + .unwrap() + }; + + let target_nblocks = prng.next_u32() % 65536 + 1; + + let start_offset = prng.next_u32() % 16384; + + // Try range sizes from 1 to 8192, so that the range is never empty + let range_size = prng.next_u32() % 8192 + 1; + + // A random range: a fixed base key plus a random start offset, with a random length + let input_start = Key::from_hex("000000067F00000001000004E10000000000") + .unwrap() + .add(start_offset); + let input_end = input_start.add(range_size); + + // This test's main success conditions are the invariants baked into do_fragment + let (_total_size, fragments) = + do_fragment(input_start, input_end, &shard_identity, target_nblocks); + + // Pick a random key within the range and check it appears in the output + let example_key = input_start.add(prng.next_u32() % range_size); + + // Panic on unwrap if it isn't found + let example_key_frag = fragments + .iter() + .find(|f| f.1.contains(&example_key)) + .unwrap(); + + // Check that the fragment containing our random key has a nonzero size if + // that key is shard-local + let example_key_local = !shard_identity.is_key_disposable(&example_key); + if example_key_local { + assert!(example_key_frag.0 > 0); + } + } + } } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index b236b93428..532185a366 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -1,16 +1,13 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use const_format::formatcp; -/// Public API types -pub mod control_api; +pub mod controller_api; pub mod key; pub mod keyspace;
pub mod models; pub mod reltag; pub mod shard; +/// Public API types +pub mod upcall_api; -pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; -pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); -pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; -pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +pub mod config; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 316d79b634..70db0b7344 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1,24 +1,35 @@ +pub mod detach_ancestor; pub mod partitioning; +pub mod utilization; + +pub use utilization::PageserverUtilization; use std::{ + borrow::Cow, collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, - time::SystemTime, + sync::atomic::AtomicUsize, + time::{Duration, SystemTime}, }; use byteorder::{BigEndian, ReadBytesExt}; +use postgres_ffi::BLCKSZ; use serde::{Deserialize, Serialize}; use serde_with::serde_as; -use strum_macros; use utils::{ completion, history_buffer::HistoryBufferWithDropCounter, id::{NodeId, TenantId, TimelineId}, lsn::Lsn, + serde_system_time, }; -use crate::{reltag::RelTag, shard::TenantShardId}; +use crate::controller_api::PlacementPolicy; +use crate::{ + reltag::RelTag, + shard::{ShardCount, ShardStripeSize, TenantShardId}, +}; use anyhow::bail; use bytes::{Buf, BufMut, Bytes, BytesMut}; @@ -150,6 +161,36 @@ impl std::fmt::Debug for TenantState { } } +/// A temporary lease to a specific lsn inside a timeline. +/// Access to the lsn is guaranteed by the pageserver until the expiration indicated by `valid_until`. 
+#[serde_as] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct LsnLease { + #[serde_as(as = "SystemTimeAsRfc3339Millis")] + pub valid_until: SystemTime, +} + +serde_with::serde_conv!( + SystemTimeAsRfc3339Millis, + SystemTime, + |time: &SystemTime| humantime::format_rfc3339_millis(*time).to_string(), + |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) } +); + +impl LsnLease { + /// The default length for an explicit LSN lease request (10 minutes). + pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60); + + /// The default length for an implicit LSN lease granted during + /// `get_lsn_by_timestamp` request (1 minute). + pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60); + + /// Checks whether the lease is expired. + pub fn is_expired(&self, now: &SystemTime) -> bool { + now > &self.valid_until + } +} + /// The only [`TenantState`] variants we could be `TenantState::Activating` from. #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum ActivatingFrom { @@ -176,7 +217,7 @@ pub enum TimelineState { Broken { reason: String, backtrace: String }, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub new_timeline_id: TimelineId, #[serde(default)] @@ -188,6 +229,48 @@ pub struct TimelineCreateRequest { pub pg_version: Option, } +#[derive(Serialize, Deserialize)] +pub struct TenantShardSplitRequest { + pub new_shard_count: u8, + + // A tenant's stripe size is only meaningful the first time their shard count goes + // above 1: therefore during a split from 1->N shards, we may modify the stripe size. + // + // If this is set while the stripe count is being increased from an already >1 value, + // then the request will fail with 400.
+ pub new_stripe_size: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct TenantShardSplitResponse { + pub new_shards: Vec, +} + +/// Parameters that apply to all shards in a tenant. Used during tenant creation. +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct ShardParameters { + pub count: ShardCount, + pub stripe_size: ShardStripeSize, +} + +impl ShardParameters { + pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); + + pub fn is_unsharded(&self) -> bool { + self.count.is_unsharded() + } +} + +impl Default for ShardParameters { + fn default() -> Self { + Self { + count: ShardCount::new(0), + stripe_size: Self::DEFAULT_STRIPE_SIZE, + } + } +} + #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantCreateRequest { @@ -195,6 +278,17 @@ pub struct TenantCreateRequest { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub generation: Option, + + // If omitted, create a single shard with TenantShardId::unsharded() + #[serde(default)] + #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] + pub shard_parameters: ShardParameters, + + // This parameter is only meaningful in requests sent to the storage controller + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -217,13 +311,15 @@ impl std::ops::Deref for TenantCreateRequest { /// An alternative representation of `pageserver::tenant::TenantConf` with /// simpler types. 
-#[derive(Serialize, Deserialize, Debug, Default)] +#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] pub struct TenantConfig { pub checkpoint_distance: Option, pub checkpoint_timeout: Option, pub compaction_target_size: Option, pub compaction_period: Option, pub compaction_threshold: Option, + // defer parsing compaction_algorithm, like eviction_policy + pub compaction_algorithm: Option, pub gc_horizon: Option, pub gc_period: Option, pub image_creation_threshold: Option, @@ -232,21 +328,195 @@ pub struct TenantConfig { pub lagging_wal_timeout: Option, pub max_lsn_wal_lag: Option, pub trace_read_requests: Option, - // We defer the parsing of the eviction_policy field to the request handler. - // Otherwise we'd have to move the types for eviction policy into this package. - // We might do that once the eviction feature has stabilizied. - // For now, this field is not even documented in the openapi_spec.yml. - pub eviction_policy: Option, + pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, - pub gc_feedback: Option, pub heatmap_period: Option, + pub lazy_slru_download: Option, + pub timeline_get_throttle: Option, + pub image_layer_creation_check_threshold: Option, + pub switch_aux_file_policy: Option, + pub lsn_lease_length: Option, + pub lsn_lease_length_for_ts: Option, +} + +/// The policy for the aux file storage. It can be switched through `switch_aux_file_policy` +/// tenant config. When the first aux file written, the policy will be persisted in the +/// `index_part.json` file and has a limited migration path. 
+/// +/// Currently, we only allow the following migration path: +/// +/// Unset -> V1 +/// -> V2 +/// -> CrossValidation -> V2 +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum AuxFilePolicy { + /// V1 aux file policy: store everything in AUX_FILE_KEY + #[strum(ascii_case_insensitive)] + V1, + /// V2 aux file policy: store in the AUX_FILE keyspace + #[strum(ascii_case_insensitive)] + V2, + /// Cross validation runs both formats on the write path and does validation + /// on the read path. + #[strum(ascii_case_insensitive)] + CrossValidation, +} + +impl AuxFilePolicy { + pub fn is_valid_migration_path(from: Option, to: Self) -> bool { + matches!( + (from, to), + (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) + ) + } + + /// If a tenant writes aux files without setting `switch_aux_file_policy`, this value will be used. + pub fn default_tenant_config() -> Self { + Self::V1 + } +} + +/// The aux file policy memory flag. Users can store `Option` into this atomic flag. 0 == unspecified.
+pub struct AtomicAuxFilePolicy(AtomicUsize); + +impl AtomicAuxFilePolicy { + pub fn new(policy: Option) -> Self { + Self(AtomicUsize::new( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + )) + } + + pub fn load(&self) -> Option { + match self.0.load(std::sync::atomic::Ordering::Acquire) { + 0 => None, + other => Some(AuxFilePolicy::from_usize(other)), + } + } + + pub fn store(&self, policy: Option) { + self.0.store( + policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), + std::sync::atomic::Ordering::Release, + ); + } +} + +impl AuxFilePolicy { + pub fn to_usize(self) -> usize { + match self { + Self::V1 => 1, + Self::CrossValidation => 2, + Self::V2 => 3, + } + } + + pub fn try_from_usize(this: usize) -> Option { + match this { + 1 => Some(Self::V1), + 2 => Some(Self::CrossValidation), + 3 => Some(Self::V2), + _ => None, + } + } + + pub fn from_usize(this: usize) -> Self { + Self::try_from_usize(this).unwrap() + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "kind")] +pub enum EvictionPolicy { + NoEviction, + LayerAccessThreshold(EvictionPolicyLayerAccessThreshold), + OnlyImitiate(EvictionPolicyLayerAccessThreshold), +} + +impl EvictionPolicy { + pub fn discriminant_str(&self) -> &'static str { + match self { + EvictionPolicy::NoEviction => "NoEviction", + EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold", + EvictionPolicy::OnlyImitiate(_) => "OnlyImitiate", + } + } +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum CompactionAlgorithm { + Legacy, + Tiered, +} + +#[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] +pub struct CompactionAlgorithmSettings { + pub kind: CompactionAlgorithm, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct 
EvictionPolicyLayerAccessThreshold { + #[serde(with = "humantime_serde")] + pub period: Duration, + #[serde(with = "humantime_serde")] + pub threshold: Duration, +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub struct ThrottleConfig { + pub task_kinds: Vec, // TaskKind + pub initial: usize, + #[serde(with = "humantime_serde")] + pub refill_interval: Duration, + pub refill_amount: NonZeroUsize, + pub max: usize, + pub fair: bool, +} + +impl ThrottleConfig { + pub fn disabled() -> Self { + Self { + task_kinds: vec![], // effectively disables the throttle + // other values don't matter with empty `task_kinds`. + initial: 0, + refill_interval: Duration::from_millis(1), + refill_amount: NonZeroUsize::new(1).unwrap(), + max: 1, + fair: true, + } + } + /// The requests per second allowed by the given config. + pub fn steady_rps(&self) -> f64 { + (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) + } } /// A flattened analog of a `pagesever::tenant::LocationMode`, which /// lists out all possible states (and the virtual "Detached" state) /// in a flat form rather than using rust-style enums. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)] pub enum LocationConfigMode { AttachedSingle, AttachedMulti, @@ -255,19 +525,21 @@ pub enum LocationConfigMode { Detached, } -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] pub struct LocationConfigSecondary { pub warm: bool, } /// An alternative representation of `pageserver::tenant::LocationConf`, /// for use in external-facing APIs. -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] pub struct LocationConfig { pub mode: LocationConfigMode, /// If attaching, in what generation? #[serde(default)] pub generation: Option, + + // If requesting mode `Secondary`, configuration for that.
#[serde(default)] pub secondary_conf: Option, @@ -280,11 +552,17 @@ pub struct LocationConfig { #[serde(default)] pub shard_stripe_size: u32, - // If requesting mode `Secondary`, configuration for that. - // Custom storage configuration for the tenant, if any + // This configuration only affects attached mode, but should be provided irrespective + // of the mode, as a secondary location might transition on startup if the response + // to the `/re-attach` control plane API requests it. pub tenant_conf: TenantConfig, } +#[derive(Serialize, Deserialize)] +pub struct LocationConfigListResponse { + pub tenant_shards: Vec<(TenantShardId, Option)>, +} + #[derive(Serialize, Deserialize)] #[serde(transparent)] pub struct TenantCreateResponse(pub TenantId); @@ -297,11 +575,31 @@ pub struct StatusResponse { #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantLocationConfigRequest { - pub tenant_id: TenantId, #[serde(flatten)] pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it } +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantTimeTravelRequest { + pub shard_counts: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantShardLocation { + pub shard_id: TenantShardId, + pub node_id: NodeId, +} + +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantLocationConfigResponse { + pub shards: Vec, + // If the shards' ShardCount count is >1, stripe_size will be set. + pub stripe_size: Option, +} + #[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantConfigRequest { @@ -368,6 +666,8 @@ pub struct TenantInfo { /// If a layer is present in both local FS and S3, it counts only once. 
pub current_physical_size: Option, // physical size is only included in `tenant_status` endpoint pub attachment_status: TenantAttachmentStatus, + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, } #[derive(Serialize, Deserialize, Clone)] @@ -375,6 +675,8 @@ pub struct TenantDetails { #[serde(flatten)] pub tenant_info: TenantInfo, + pub walredo: Option, + pub timelines: Vec, } @@ -403,6 +705,8 @@ pub struct TimelineInfo { pub current_logical_size: u64, pub current_logical_size_is_accurate: bool, + pub directory_entries_counts: Vec, + /// Sum of the size of all layer files. /// If a layer is present in both local FS and S3, it counts only once. pub current_physical_size: Option, // is None when timeline is Unloaded @@ -419,9 +723,12 @@ pub struct TimelineInfo { pub state: TimelineState, pub walreceiver_status: String, + + /// The last aux file policy being used on this timeline + pub last_aux_file_policy: Option, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerMapInfo { pub in_memory_layers: Vec, pub historic_layers: Vec, @@ -439,7 +746,7 @@ pub enum LayerAccessKind { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStatFullDetails { pub when_millis_since_epoch: u64, - pub task_kind: &'static str, + pub task_kind: Cow<'static, str>, pub access_kind: LayerAccessKind, } @@ -498,23 +805,23 @@ impl LayerResidenceEvent { } } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct LayerAccessStats { pub access_count_by_access_kind: HashMap, - pub task_kind_access_flag: Vec<&'static str>, + pub task_kind_access_flag: Vec>, pub first: Option, pub accesses_history: HistoryBufferWithDropCounter, pub residence_events_history: HistoryBufferWithDropCounter, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum InMemoryLayerInfo { Open { lsn_start: Lsn }, Frozen { lsn_start: 
Lsn, lsn_end: Lsn }, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum HistoricLayerInfo { Delta { @@ -525,6 +832,8 @@ pub enum HistoricLayerInfo { lsn_end: Lsn, remote: bool, access_stats: LayerAccessStats, + + l0: bool, }, Image { layer_file_name: String, @@ -536,11 +845,57 @@ pub enum HistoricLayerInfo { }, } +impl HistoricLayerInfo { + pub fn layer_file_name(&self) -> &str { + match self { + HistoricLayerInfo::Delta { + layer_file_name, .. + } => layer_file_name, + HistoricLayerInfo::Image { + layer_file_name, .. + } => layer_file_name, + } + } + pub fn is_remote(&self) -> bool { + match self { + HistoricLayerInfo::Delta { remote, .. } => *remote, + HistoricLayerInfo::Image { remote, .. } => *remote, + } + } + pub fn set_remote(&mut self, value: bool) { + let field = match self { + HistoricLayerInfo::Delta { remote, .. } => remote, + HistoricLayerInfo::Image { remote, .. } => remote, + }; + *field = value; + } + pub fn layer_file_size(&self) -> u64 { + match self { + HistoricLayerInfo::Delta { + layer_file_size, .. + } => *layer_file_size, + HistoricLayerInfo::Image { + layer_file_size, .. 
+ } => *layer_file_size, + } + } +} + #[derive(Debug, Serialize, Deserialize)] pub struct DownloadRemoteLayersTaskSpawnRequest { pub max_concurrent_downloads: NonZeroUsize, } +#[derive(Debug, Serialize, Deserialize)] +pub struct IngestAuxFilesRequest { + pub aux_files: HashMap, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ListAuxFilesRequest { + pub lsn: Lsn, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DownloadRemoteLayersTaskInfo { pub task_id: String, @@ -562,6 +917,117 @@ pub struct TimelineGcRequest { pub gc_horizon: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerProcessStatus { + pub pid: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WalRedoManagerStatus { + pub last_redo_at: Option>, + pub process: Option, +} + +/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating +/// a download job, timing out while waiting for it to run, and then inspecting this status to understand +/// what's happening. +#[derive(Default, Debug, Serialize, Deserialize, Clone)] +pub struct SecondaryProgress { + /// The remote storage LastModified time of the heatmap object we last downloaded. 
+ pub heatmap_mtime: Option, + + /// The number of layers currently on-disk + pub layers_downloaded: usize, + /// The number of layers in the most recently seen heatmap + pub layers_total: usize, + + /// The number of layer bytes currently on-disk + pub bytes_downloaded: u64, + /// The number of layer bytes in the most recently seen heatmap + pub bytes_total: u64, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantScanRemoteStorageShard { + pub tenant_shard_id: TenantShardId, + pub generation: Option, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TenantScanRemoteStorageResponse { + pub shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "snake_case")] +pub enum TenantSorting { + ResidentSize, + MaxLogicalSize, +} + +impl Default for TenantSorting { + fn default() -> Self { + Self::ResidentSize + } +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct TopTenantShardsRequest { + // How would you like to sort the tenants? + pub order_by: TenantSorting, + + // How many results? + pub limit: usize, + + // Omit tenants with more than this many shards (e.g. 
if this is the max number of shards + // that the caller would ever split to) + pub where_shards_lt: Option, + + // Omit tenants where the ordering metric is less than this (this is an optimization to + // let us quickly exclude numerous tiny shards) + pub where_gt: Option, +} + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +pub struct TopTenantShardItem { + pub id: TenantShardId, + + /// Total size of layers on local disk for all timelines in this tenant + pub resident_size: u64, + + /// Total size of layers in remote storage for all timelines in this tenant + pub physical_size: u64, + + /// The largest logical size of a timeline within this tenant + pub max_logical_size: u64, +} + +#[derive(Serialize, Deserialize, Debug, Default)] +pub struct TopTenantShardsResponse { + pub shards: Vec, +} + +pub mod virtual_file { + #[derive( + Copy, + Clone, + PartialEq, + Eq, + Hash, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, + )] + #[strum(serialize_all = "kebab-case")] + pub enum IoEngineKind { + StdFs, + #[cfg(target_os = "linux")] + TokioEpollUring, + } +} + // Wrapped in libpq CopyData #[derive(PartialEq, Eq, Debug)] pub enum PagestreamFeMessage { @@ -569,6 +1035,7 @@ pub enum PagestreamFeMessage { Nblocks(PagestreamNblocksRequest), GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), + GetSlruSegment(PagestreamGetSlruSegmentRequest), } // Wrapped in libpq CopyData @@ -579,6 +1046,7 @@ pub enum PagestreamBeMessage { GetPage(PagestreamGetPageResponse), Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), + GetSlruSegment(PagestreamGetSlruSegmentResponse), } // Keep in sync with `pagestore_client.h` @@ -589,6 +1057,7 @@ enum PagestreamBeMessageTag { GetPage = 102, Error = 103, DbSize = 104, + GetSlruSegment = 105, } impl TryFrom for PagestreamBeMessageTag { type Error = u8; @@ -599,40 +1068,82 @@ impl TryFrom for PagestreamBeMessageTag { 102 => 
Ok(PagestreamBeMessageTag::GetPage), 103 => Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), + 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), _ => Err(value), } } } +// In the V2 protocol version, a GetPage request contains two LSN values: +// +// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means +// "get the latest version present". It's used by the primary server, which knows that no one else +// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is +// Lsn::Max. Standby servers use the current replay LSN as the request LSN. +// +// not_modified_since: Hint to the pageserver that the client knows that the page has not been +// modified between 'not_modified_since' and the request LSN. It's always correct to set +// 'not_modified_since' equal to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but +// passing an earlier LSN can speed up the request, by allowing the pageserver to process the +// request without waiting for 'request_lsn' to arrive. +// +// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was +// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and +// 'latest' was set to true. The V2 interface was added because there was no correct way for a +// standby to request a page at a particular non-latest LSN, and also include the +// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the +// request, if the standby knows that the page hasn't been modified since, and risk getting an error +// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could +// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2 +// interface allows sending both LSNs, and let the pageserver do the right thing.
There is no +// difference in the responses between V1 and V2. +// +// The Request structs below reflect the V2 interface. If V1 is used, the parse function +// maps the old format requests to the new format. +// +#[derive(Clone, Copy)] +pub enum PagestreamProtocolVersion { + V1, + V2, +} + #[derive(Debug, PartialEq, Eq)] pub struct PagestreamExistsRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamNblocksRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamGetPageRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub rel: RelTag, pub blkno: u32, } #[derive(Debug, PartialEq, Eq)] pub struct PagestreamDbSizeRequest { - pub latest: bool, - pub lsn: Lsn, + pub request_lsn: Lsn, + pub not_modified_since: Lsn, pub dbnode: u32, } +#[derive(Debug, PartialEq, Eq)] +pub struct PagestreamGetSlruSegmentRequest { + pub request_lsn: Lsn, + pub not_modified_since: Lsn, + pub kind: u8, + pub segno: u32, +} + #[derive(Debug)] pub struct PagestreamExistsResponse { pub exists: bool, @@ -648,6 +1159,11 @@ pub struct PagestreamGetPageResponse { pub page: Bytes, } +#[derive(Debug)] +pub struct PagestreamGetSlruSegmentResponse { + pub segment: Bytes, +} + #[derive(Debug)] pub struct PagestreamErrorResponse { pub message: String, @@ -658,15 +1174,28 @@ pub struct PagestreamDbSizeResponse { pub db_size: i64, } +// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields +// that require pageserver-internal types. It is sufficient to get the total size. +#[derive(Serialize, Deserialize, Debug)] +pub struct TenantHistorySize { + pub id: TenantId, + /// Size is a mixture of WAL and logical size, so the unit is bytes. 
+ /// + /// Will be none if `?inputs_only=true` was given. + pub size: Option, +} + impl PagestreamFeMessage { + /// Serialize a compute -> pageserver message. This is currently only used in testing + /// tools. Always uses protocol version 2. pub fn serialize(&self) -> Bytes { let mut bytes = BytesMut::new(); match self { Self::Exists(req) => { bytes.put_u8(0); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -675,8 +1204,8 @@ impl PagestreamFeMessage { Self::Nblocks(req) => { bytes.put_u8(1); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -685,8 +1214,8 @@ impl PagestreamFeMessage { Self::GetPage(req) => { bytes.put_u8(2); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.rel.spcnode); bytes.put_u32(req.rel.dbnode); bytes.put_u32(req.rel.relnode); @@ -696,27 +1225,57 @@ impl PagestreamFeMessage { Self::DbSize(req) => { bytes.put_u8(3); - bytes.put_u8(u8::from(req.latest)); - bytes.put_u64(req.lsn.0); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); bytes.put_u32(req.dbnode); } + + Self::GetSlruSegment(req) => { + bytes.put_u8(4); + bytes.put_u64(req.request_lsn.0); + bytes.put_u64(req.not_modified_since.0); + bytes.put_u8(req.kind); + bytes.put_u32(req.segno); + } } bytes.into() } - pub fn parse(body: &mut R) -> anyhow::Result { - // TODO these gets can fail - + pub fn parse( + body: &mut R, + protocol_version: PagestreamProtocolVersion, + ) -> anyhow::Result { // these correspond to the NeonMessageTag enum in pagestore_client.h // // TODO: 
consider using protobuf or serde bincode for less error prone // serialization. let msg_tag = body.read_u8()?; + + let (request_lsn, not_modified_since) = match protocol_version { + PagestreamProtocolVersion::V2 => ( + Lsn::from(body.read_u64::()?), + Lsn::from(body.read_u64::()?), + ), + PagestreamProtocolVersion::V1 => { + // In the old protocol, each message starts with a boolean 'latest' flag, + // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and + // 'not_modified_since', used in the new protocol version. + let latest = body.read_u8()? != 0; + let request_lsn = Lsn::from(body.read_u64::()?); + if latest { + (Lsn::MAX, request_lsn) // get latest version + } else { + (request_lsn, request_lsn) // get version at specified LSN + } + } + }; + + // The rest of the messages are the same between V1 and V2 match msg_tag { 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -725,8 +1284,8 @@ impl PagestreamFeMessage { }, })), 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -735,8 +1294,8 @@ impl PagestreamFeMessage { }, })), 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.read_u8()? != 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, rel: RelTag { spcnode: body.read_u32::()?, dbnode: body.read_u32::()?, @@ -746,10 +1305,18 @@ impl PagestreamFeMessage { blkno: body.read_u32::()?, })), 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.read_u8()? 
!= 0, - lsn: Lsn::from(body.read_u64::()?), + request_lsn, + not_modified_since, dbnode: body.read_u32::()?, })), + 4 => Ok(PagestreamFeMessage::GetSlruSegment( + PagestreamGetSlruSegmentRequest { + request_lsn, + not_modified_since, + kind: body.read_u8()?, + segno: body.read_u32::()?, + }, + )), _ => bail!("unknown smgr message tag: {:?}", msg_tag), } } @@ -785,6 +1352,12 @@ impl PagestreamBeMessage { bytes.put_u8(Tag::DbSize as u8); bytes.put_i64(resp.db_size); } + + Self::GetSlruSegment(resp) => { + bytes.put_u8(Tag::GetSlruSegment as u8); + bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); + bytes.put(&resp.segment[..]); + } } bytes.into() @@ -825,6 +1398,14 @@ impl PagestreamBeMessage { let db_size = buf.read_i64::()?; Self::DbSize(PagestreamDbSizeResponse { db_size }) } + Tag::GetSlruSegment => { + let n_blocks = buf.read_u32::()?; + let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize]; + buf.read_exact(&mut segment)?; + Self::GetSlruSegment(PagestreamGetSlruSegmentResponse { + segment: segment.into(), + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -843,14 +1424,15 @@ impl PagestreamBeMessage { Self::GetPage(_) => "GetPage", Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", + Self::GetSlruSegment(_) => "GetSlruSegment", } } } #[cfg(test)] mod tests { - use bytes::Buf; use serde_json::json; + use std::str::FromStr; use super::*; @@ -859,8 +1441,8 @@ mod tests { // Test serialization/deserialization of PagestreamFeMessage let messages = vec![ PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -869,8 +1451,8 @@ mod tests { }, }), PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: false, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(4), rel: RelTag { forknum: 1, spcnode: 2, @@ -879,8 +1461,8 @@ mod tests { }, }), 
PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), rel: RelTag { forknum: 1, spcnode: 2, @@ -890,14 +1472,16 @@ mod tests { blkno: 7, }), PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: true, - lsn: Lsn(4), + request_lsn: Lsn(4), + not_modified_since: Lsn(3), dbnode: 7, }), ]; for msg in messages { let bytes = msg.serialize(); - let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap(); + let reconstructed = + PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2) + .unwrap(); assert!(msg == reconstructed); } } @@ -910,6 +1494,7 @@ mod tests { state: TenantState::Active, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, + generation: None, }; let expected_active = json!({ "id": original_active.id.to_string(), @@ -930,6 +1515,7 @@ mod tests { }, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, + generation: None, }; let expected_broken = json!({ "id": original_broken.id.to_string(), @@ -1054,4 +1640,69 @@ mod tests { assert_eq!(actual, expected, "example on {line}"); } } + + #[test] + fn test_aux_file_migration_path() { + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V1 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::V2 + )); + assert!(AuxFilePolicy::is_valid_migration_path( + None, + AuxFilePolicy::CrossValidation + )); + // Self-migration is not a valid migration path, and the caller should handle it by itself. 
+ assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::CrossValidation + )); + // Migrations not allowed + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::V2 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::V1 + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V2), + AuxFilePolicy::CrossValidation + )); + assert!(!AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::V1), + AuxFilePolicy::CrossValidation + )); + // Migrations allowed + assert!(AuxFilePolicy::is_valid_migration_path( + Some(AuxFilePolicy::CrossValidation), + AuxFilePolicy::V2 + )); + } + + #[test] + fn test_aux_parse() { + assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2); + assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2); + assert_eq!( + AuxFilePolicy::from_str("cross-validation").unwrap(), + AuxFilePolicy::CrossValidation + ); + } } diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs new file mode 100644 index 0000000000..fc1f10e734 --- /dev/null +++ b/libs/pageserver_api/src/models/detach_ancestor.rs @@ -0,0 +1,6 @@ +use utils::id::TimelineId; + +#[derive(Default, serde::Serialize)] +pub struct AncestorDetached { + pub reparented_timelines: Vec, +} diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index 0d287f7be0..f6644be635 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ 
b/libs/pageserver_api/src/models/partitioning.rs @@ -1,9 +1,11 @@ use utils::lsn::Lsn; +use crate::keyspace::SparseKeySpace; + #[derive(Debug, PartialEq, Eq)] pub struct Partitioning { pub keys: crate::keyspace::KeySpace, - + pub sparse_keys: crate::keyspace::SparseKeySpace, pub at_lsn: Lsn, } @@ -32,6 +34,8 @@ impl serde::Serialize for Partitioning { let mut map = serializer.serialize_map(Some(2))?; map.serialize_key("keys")?; map.serialize_value(&KeySpace(&self.keys))?; + map.serialize_key("sparse_keys")?; + map.serialize_value(&KeySpace(&self.sparse_keys.0))?; map.serialize_key("at_lsn")?; map.serialize_value(&WithDisplay(&self.at_lsn))?; map.end() @@ -99,6 +103,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { #[derive(serde::Deserialize)] struct De { keys: KeySpace, + sparse_keys: KeySpace, #[serde_as(as = "serde_with::DisplayFromStr")] at_lsn: Lsn, } @@ -107,6 +112,7 @@ impl<'a> serde::Deserialize<'a> for Partitioning { Ok(Self { at_lsn: de.at_lsn, keys: de.keys.0, + sparse_keys: SparseKeySpace(de.sparse_keys.0), }) } } @@ -133,6 +139,12 @@ mod tests { "030000000000000000000000000000000003" ] ], + "sparse_keys": [ + [ + "620000000000000000000000000000000000", + "620000000000000000000000000000000003" + ] + ], "at_lsn": "0/2240160" } "#; diff --git a/libs/pageserver_api/src/models/utilization.rs b/libs/pageserver_api/src/models/utilization.rs new file mode 100644 index 0000000000..e88cab5d6a --- /dev/null +++ b/libs/pageserver_api/src/models/utilization.rs @@ -0,0 +1,64 @@ +use utils::serde_system_time::SystemTime; + +/// Pageserver current utilization and scoring for how good candidate the pageserver would be for +/// the next tenant. +/// +/// See and maintain pageserver openapi spec for `/v1/utilization_score` as the truth. +/// +/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might +/// not handle full u64 values properly. 
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)] +pub struct PageserverUtilization { + /// Used disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub disk_usage_bytes: u64, + /// Free disk space + #[serde(serialize_with = "ser_saturating_u63")] + pub free_space_bytes: u64, + /// Lower is better score for how good candidate for a next tenant would this pageserver be. + #[serde(serialize_with = "ser_saturating_u63")] + pub utilization_score: u64, + /// When was this snapshot captured, pageserver local time. + /// + /// Use millis to give confidence that the value is regenerated often enough. + pub captured_at: SystemTime, +} + +/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients. +/// +/// Instead of newtype, use this because a newtype would get require handling deserializing values +/// with the highest bit set which is properly parsed by serde formats, but would create a +/// conundrum on how to handle and again serialize such values at type level. It will be a few +/// years until we can use more than `i64::MAX` bytes on a disk. 
+fn ser_saturating_u63(value: &u64, serializer: S) -> Result { + const MAX_FORMAT_INT64: u64 = i64::MAX as u64; + + let value = (*value).min(MAX_FORMAT_INT64); + + serializer.serialize_u64(value) +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::*; + + #[test] + fn u64_max_is_serialized_as_u63_max() { + let doc = PageserverUtilization { + disk_usage_bytes: u64::MAX, + free_space_bytes: 0, + utilization_score: u64::MAX, + captured_at: SystemTime( + std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779), + ), + }; + + let s = serde_json::to_string(&doc).unwrap(); + + let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#; + + assert_eq!(s, expected); + } +} diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 33402ca8ba..010a9c2932 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -3,7 +3,7 @@ use std::cmp::Ordering; use std::fmt; use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID; -use postgres_ffi::relfile_utils::forknumber_to_name; +use postgres_ffi::relfile_utils::{forkname_to_number, forknumber_to_name, MAIN_FORKNUM}; use postgres_ffi::Oid; /// @@ -32,6 +32,9 @@ pub struct RelTag { pub relnode: Oid, } +/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. 
+pub type BlockNumber = u32; + impl PartialOrd for RelTag { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -65,6 +68,57 @@ impl fmt::Display for RelTag { } } +#[derive(Debug, thiserror::Error)] +pub enum ParseRelTagError { + #[error("invalid forknum")] + InvalidForknum(#[source] std::num::ParseIntError), + #[error("missing triplet member {}", .0)] + MissingTripletMember(usize), + #[error("invalid triplet member {}", .0)] + InvalidTripletMember(usize, #[source] std::num::ParseIntError), +} + +impl std::str::FromStr for RelTag { + type Err = ParseRelTagError; + + fn from_str(s: &str) -> Result { + use ParseRelTagError::*; + + // FIXME: in postgres logs this separator is dot + // Example: + // could not read block 2 in rel 1663/208101/2620.1 from page server at lsn 0/2431E6F0 + // with a regex we could get this more painlessly + let (triplet, forknum) = match s.split_once('_').or_else(|| s.split_once('.')) { + Some((t, f)) => { + let forknum = forkname_to_number(Some(f)); + let forknum = if let Ok(f) = forknum { + f + } else { + f.parse::().map_err(InvalidForknum)? + }; + + (t, Some(forknum)) + } + None => (s, None), + }; + + let mut split = triplet + .splitn(3, '/') + .enumerate() + .map(|(i, s)| s.parse::().map_err(|e| InvalidTripletMember(i, e))); + let spcnode = split.next().ok_or(MissingTripletMember(0))??; + let dbnode = split.next().ok_or(MissingTripletMember(1))??; + let relnode = split.next().ok_or(MissingTripletMember(2))??; + + Ok(RelTag { + spcnode, + forknum: forknum.unwrap_or(MAIN_FORKNUM), + dbnode, + relnode, + }) + } +} + impl RelTag { pub fn to_segfile_name(&self, segno: u32) -> String { let mut name = if self.spcnode == GLOBALTABLESPACE_OID { @@ -108,9 +162,24 @@ impl RelTag { /// These files are divided into segments, which are divided into /// pages of the same BLCKSZ as used for relation files. 
/// -#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] +#[derive( + Debug, + Clone, + Copy, + Hash, + Serialize, + Deserialize, + PartialEq, + Eq, + PartialOrd, + Ord, + strum_macros::EnumIter, + strum_macros::FromRepr, + enum_map::Enum, +)] +#[repr(u8)] pub enum SlruKind { - Clog, + Clog = 0, MultiXactMembers, MultiXactOffsets, } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 18ef2be523..8c5a4e6168 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -1,45 +1,87 @@ use std::{ops::RangeInclusive, str::FromStr}; -use crate::key::{is_rel_block_key, Key}; +use crate::{key::Key, models::ShardParameters}; use hex::FromHex; +use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -use thiserror; use utils::id::TenantId; +/// See docs/rfcs/031-sharding-static.md for an overview of sharding. +/// +/// This module contains a variety of types used to represent the concept of sharding +/// a Neon tenant across multiple physical shards. Since there are quite a few of these, +/// we provide an summary here. +/// +/// Types used to describe shards: +/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +/// which identifies a tenant which is not shard-aware. This means its storage paths do not include +/// a shard suffix. +/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +/// without the tenant ID. This is useful for things that are implicitly scoped to a particular +/// tenant, such as layer files. +/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. 
+/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +/// four hex digits. An unsharded tenant is `0000`. +/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +/// +/// Types used to describe the parameters for data distribution in a sharded tenant: +/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +/// multiple shards. Its value is given in 8kiB pages. +/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +/// always zero: this is provided for future upgrades that might introduce different +/// data distribution schemes. +/// +/// Examples: +/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +/// and their slugs are 0004, 0104, 0204, and 0304. + #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] pub struct ShardNumber(pub u8); #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardCount(pub u8); +pub struct ShardCount(u8); -impl ShardCount { - pub const MAX: Self = Self(u8::MAX); +/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. 
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, } -impl ShardNumber { - pub const MAX: Self = Self(u8::MAX); +/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], +/// and to check whether that [`ShardNumber`] is the same as the current shard. +#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] +pub struct ShardIdentity { + pub number: ShardNumber, + pub count: ShardCount, + pub stripe_size: ShardStripeSize, + layout: ShardLayout, } -/// TenantShardId identify the units of work for the Pageserver. -/// -/// These are written as `-`, for example: +/// Formatting helper, for generating the `shard_id` label in traces. +struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. /// +/// These are written as `-`, for example: /// # The second shard in a two-shard tenant /// 072f1291a5310026820b2fe4b2968934-0102 /// -/// Historically, tenants could not have multiple shards, and were identified -/// by TenantId. To support this, TenantShardId has a special legacy -/// mode where `shard_count` is equal to zero: this represents a single-sharded -/// tenant which should be written as a TenantId with no suffix. +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. 
/// -/// The human-readable encoding of TenantShardId, such as used in API URLs, -/// is both forward and backward compatible: a legacy TenantId can be +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be /// decoded as a TenantShardId, and when re-encoded it will be parseable /// as a TenantId. -/// -/// Note that the binary encoding is _not_ backward compatible, because -/// at the time sharding is introduced, there are no existing binary structures -/// containing TenantId that we need to handle. #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] pub struct TenantShardId { pub tenant_id: TenantId, @@ -47,6 +89,48 @@ pub struct TenantShardId { pub shard_count: ShardCount, } +impl ShardCount { + pub const MAX: Self = Self(u8::MAX); + + /// The internal value of a ShardCount may be zero, which means "1 shard, but use + /// legacy format for TenantShardId that excludes the shard suffix", also known + /// as [`TenantShardId::unsharded`]. + /// + /// This method returns the actual number of shards, i.e. if our internal value is + /// zero, we return 1 (unsharded tenants have 1 shard). + pub fn count(&self) -> u8 { + if self.0 > 0 { + self.0 + } else { + 1 + } + } + + /// The literal internal value: this is **not** the number of shards in the + /// tenant, as we have a special zero value for legacy unsharded tenants. Use + /// [`Self::count`] if you want to know the cardinality of shards. + pub fn literal(&self) -> u8 { + self.0 + } + + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. + pub fn is_unsharded(&self) -> bool { + self.0 == 0 + } + + /// `v` may be zero, or the number of shards in the tenant. `v` is what + /// [`Self::literal`] would return. 
+ pub const fn new(val: u8) -> Self { + Self(val) + } +} + +impl ShardNumber { + pub const MAX: Self = Self(u8::MAX); +} + impl TenantShardId { pub fn unsharded(tenant_id: TenantId) -> Self { Self { @@ -78,18 +162,48 @@ impl TenantShardId { } /// Convenience for code that has special behavior on the 0th shard. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.shard_number == ShardNumber(0) } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() + } + + /// Convenience for dropping the tenant_id and just getting the ShardIndex: this + /// is useful when logging from code that is already in a span that includes tenant ID, to + /// keep messages reasonably terse. + pub fn to_index(&self) -> ShardIndex { + ShardIndex { + shard_number: self.shard_number, + shard_count: self.shard_count, + } + } + + /// Calculate the children of this TenantShardId when splitting the overall tenant into + /// the given number of shards. + pub fn split(&self, new_shard_count: ShardCount) -> Vec { + let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); + let mut child_shards = Vec::new(); + for shard_number in 0..ShardNumber(new_shard_count.0).0 { + // Key mapping is based on a round robin mapping of key hash modulo shard count, + // so our child shards are the ones which the same keys would map to. 
+ if shard_number % effective_old_shard_count == self.shard_number.0 { + child_shards.push(TenantShardId { + tenant_id: self.tenant_id, + shard_number: ShardNumber(shard_number), + shard_count: new_shard_count, + }) + } + } + + child_shards } } -/// Formatting helper -struct ShardSlug<'a>(&'a TenantShardId); - impl<'a> std::fmt::Display for ShardSlug<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -159,16 +273,6 @@ impl From<[u8; 18]> for TenantShardId { } } -/// For use within the context of a particular tenant, when we need to know which -/// shard we're dealing with, but do not need to know the full ShardIdentity (because -/// we won't be doing any page->shard mapping), and do not need to know the fully qualified -/// TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - impl ShardIndex { pub fn new(number: ShardNumber, count: ShardCount) -> Self { Self { @@ -183,6 +287,9 @@ impl ShardIndex { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. pub fn is_unsharded(&self) -> bool { self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) } @@ -250,6 +357,8 @@ impl Serialize for TenantShardId { if serializer.is_human_readable() { serializer.collect_str(self) } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. 
let mut packed: [u8; 18] = [0; 18]; packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); packed[16] = self.shard_number.0; @@ -316,6 +425,12 @@ impl<'de> Deserialize<'de> for TenantShardId { #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); +impl Default for ShardStripeSize { + fn default() -> Self { + DEFAULT_STRIPE_SIZE + } +} + /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardLayout(u8); @@ -327,16 +442,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); /// Default stripe size in pages: 256MiB divided by 8kiB page size. const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); -/// The ShardIdentity contains the information needed for one member of map -/// to resolve a key to a shard, and then check whether that shard is ==self. -#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] -pub struct ShardIdentity { - pub number: ShardNumber, - pub count: ShardCount, - stripe_size: ShardStripeSize, - layout: ShardLayout, -} - #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { #[error("Invalid shard count")] @@ -351,7 +456,7 @@ impl ShardIdentity { /// An identity with number=0 count=0 is a "none" identity, which represents legacy /// tenants. Modern single-shard tenants should not use this: they should /// have number=0 count=1. - pub fn unsharded() -> Self { + pub const fn unsharded() -> Self { Self { number: ShardNumber(0), count: ShardCount(0), @@ -376,6 +481,9 @@ impl ShardIdentity { } } + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. 
pub fn is_unsharded(&self) -> bool { self.number == ShardNumber(0) && self.count == ShardCount(0) } @@ -403,6 +511,17 @@ impl ShardIdentity { } } + /// For use when creating ShardIdentity instances for new shards, where a creation request + /// specifies the ShardParameters that apply to all shards. + pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self { + Self { + number, + count: params.count, + layout: LAYOUT_V1, + stripe_size: params.stripe_size, + } + } + fn is_broken(&self) -> bool { self.layout == LAYOUT_BROKEN } @@ -413,6 +532,8 @@ impl ShardIdentity { } /// Return true if the key should be ingested by this shard + /// + /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { assert!(!self.is_broken()); if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) { @@ -423,20 +544,32 @@ impl ShardIdentity { } /// Return true if the key should be discarded if found in this shard's - /// data store, e.g. during compaction after a split + /// data store, e.g. during compaction after a split. + /// + /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { if key_is_shard0(key) { // Q: Why can't we dispose of shard0 content if we're not shard 0? - // A: because the WAL ingestion logic currently ingests some shard 0 - // content on all shards, even though it's only read on shard 0. If we - // dropped it, then subsequent WAL ingest to these keys would encounter - // an error. + // A1: because the WAL ingestion logic currently ingests some shard 0 + // content on all shards, even though it's only read on shard 0. If we + // dropped it, then subsequent WAL ingest to these keys would encounter + // an error. + // A2: because key_is_shard0 also covers relation size keys, which are written + // on all shards even though they're only maintained accurately on shard 0. 
false } else { !self.is_key_local(key) } } + /// Obtains the shard number and count combined into a `ShardIndex`. + pub fn shard_index(&self) -> ShardIndex { + ShardIndex { + shard_count: self.count, + shard_number: self.number, + } + } + pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) @@ -447,7 +580,7 @@ impl ShardIdentity { /// Convenience for checking if this identity is the 0th shard in a tenant, /// for special cases on shard 0 such as ingesting relation sizes. - pub fn is_zero(&self) -> bool { + pub fn is_shard_zero(&self) -> bool { self.number == ShardNumber(0) } } @@ -530,7 +663,13 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - !is_rel_block_key(key) + // + // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table + // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0 + // because they must be included in basebackups. + let is_initfork = key.field5 == INIT_FORKNUM; + + !key.is_rel_block_key() || is_initfork } /// Provide the same result as the function in postgres `hashfn.h` with the same name @@ -577,12 +716,28 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke ShardNumber((hash % count.0 as u32) as u8) } +/// For debugging, while not exposing the internals. 
+#[derive(Debug)] +#[allow(unused)] // used by debug formatting by pagectl +struct KeyShardingInfo { + shard0: bool, + shard_number: ShardNumber, +} + +pub fn describe( + key: &Key, + shard_count: ShardCount, + stripe_size: ShardStripeSize, +) -> impl std::fmt::Debug { + KeyShardingInfo { + shard0: key_is_shard0(key), + shard_number: key_to_shard_number(shard_count, stripe_size, key), + } +} + #[cfg(test)] mod tests { - use std::str::FromStr; - - use bincode; - use utils::{id::TenantId, Hex}; + use utils::Hex; use super::*; @@ -773,4 +928,108 @@ mod tests { let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key); assert_eq!(shard, ShardNumber(8)); } + + #[test] + fn shard_id_split() { + let tenant_id = TenantId::generate(); + let parent = TenantShardId::unsharded(tenant_id); + + // Unsharded into 2 + assert_eq!( + parent.split(ShardCount(2)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1) + } + ] + ); + + // Unsharded into 4 + assert_eq!( + parent.split(ShardCount(4)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(1) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(2) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(4), + shard_number: ShardNumber(3) + } + ] + ); + + // count=1 into 2 (check this works the same as unsharded.) 
+ let parent = TenantShardId { + tenant_id, + shard_count: ShardCount(1), + shard_number: ShardNumber(0), + }; + assert_eq!( + parent.split(ShardCount(2)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(0) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1) + } + ] + ); + + // count=2 into count=8 + let parent = TenantShardId { + tenant_id, + shard_count: ShardCount(2), + shard_number: ShardNumber(1), + }; + assert_eq!( + parent.split(ShardCount(8)), + vec![ + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(1) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(3) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(5) + }, + TenantShardId { + tenant_id, + shard_count: ShardCount(8), + shard_number: ShardNumber(7) + }, + ] + ); + } } diff --git a/libs/pageserver_api/src/control_api.rs b/libs/pageserver_api/src/upcall_api.rs similarity index 54% rename from libs/pageserver_api/src/control_api.rs rename to libs/pageserver_api/src/upcall_api.rs index 0acc3a7bb0..2e88836bd0 100644 --- a/libs/pageserver_api/src/control_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -6,19 +6,36 @@ use serde::{Deserialize, Serialize}; use utils::id::NodeId; -use crate::shard::TenantShardId; +use crate::{ + controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId, +}; +/// Upcall message sent by the pageserver to the configured `control_plane_api` on +/// startup. #[derive(Serialize, Deserialize)] pub struct ReAttachRequest { pub node_id: NodeId, + + /// Optional inline self-registration: this is useful with the storage controller, + /// if the node already has a node_id set. 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub register: Option, } -#[derive(Serialize, Deserialize)] +fn default_mode() -> LocationConfigMode { + LocationConfigMode::AttachedSingle +} + +#[derive(Serialize, Deserialize, Debug)] pub struct ReAttachResponseTenant { pub id: TenantShardId, - pub gen: u32, -} + /// Mandatory if LocationConfigMode is None or set to an Attached* mode + pub gen: Option, + /// Default value only for backward compat: this field should be set + #[serde(default = "default_mode")] + pub mode: LocationConfigMode, +} #[derive(Serialize, Deserialize)] pub struct ReAttachResponse { pub tenants: Vec, diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 73d25619c3..6c41b7f347 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -6,7 +6,6 @@ #![deny(clippy::undocumented_unsafe_blocks)] use anyhow::Context; use bytes::Bytes; -use futures::pin_mut; use serde::{Deserialize, Serialize}; use std::io::ErrorKind; use std::net::SocketAddr; @@ -378,8 +377,7 @@ impl PostgresBackend { &mut self, cx: &mut std::task::Context<'_>, ) -> Poll> { - let flush_fut = self.flush(); - pin_mut!(flush_fut); + let flush_fut = std::pin::pin!(self.flush()); flush_fut.poll(cx) } @@ -822,10 +820,11 @@ impl PostgresBackend { Ok(ProcessMsgResult::Continue) } - /// Log as info/error result of handling COPY stream and send back - /// ErrorResponse if that makes sense. Shutdown the stream if we got - /// Terminate. TODO: transition into waiting for Sync msg if we initiate the - /// close. + /// - Log as info/error result of handling COPY stream and send back + /// ErrorResponse if that makes sense. + /// - Shutdown the stream if we got Terminate. + /// - Then close the connection because we don't handle exiting from COPY + /// stream normally. 
pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) { use CopyStreamHandlerEnd::*; @@ -851,10 +850,6 @@ impl PostgresBackend { } } - if let Terminate = &end { - self.state = ProtoState::Closed; - } - let err_to_send_and_errcode = match &end { ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)), Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)), @@ -884,6 +879,12 @@ impl PostgresBackend { error!("failed to send ErrorResponse: {}", ee); } } + + // Proper COPY stream finishing to continue using the connection is not + // implemented at the server side (we don't need it so far). To prevent + // further usages of the connection, close it. + self.framed.shutdown().await.ok(); + self.state = ProtoState::Closed; } } diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index e046fa5260..80df9db858 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -72,14 +72,19 @@ async fn simple_select() { } } -static KEY: Lazy = Lazy::new(|| { +static KEY: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("key.pem")); - rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone()) + let key = rustls_pemfile::rsa_private_keys(&mut cursor) + .next() + .unwrap() + .unwrap(); + rustls::pki_types::PrivateKeyDer::Pkcs1(key) }); -static CERT: Lazy = Lazy::new(|| { +static CERT: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("cert.pem")); - rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone()) + let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap(); + cert }); // test that basic select with ssl works @@ -88,9 +93,8 @@ async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; let server_cfg = rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - 
.with_single_cert(vec![CERT.clone()], KEY.clone()) + .with_single_cert(vec![CERT.clone()], KEY.clone_key()) .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = @@ -102,10 +106,9 @@ async fn simple_select_ssl() { }); let client_cfg = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&CERT).unwrap(); + store.add(CERT.clone()).unwrap(); store }) .with_no_client_auth(); diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index ccf9108895..9f57f3d507 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -178,6 +178,13 @@ impl PgConnectionConfig { } } +impl fmt::Display for PgConnectionConfig { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // The password is intentionally hidden and not part of this display string. + write!(f, "postgresql://{}:{}", self.host, self.port) + } +} + impl fmt::Debug for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // We want `password: Some(REDACTED-STRING)`, not `password: Some("REDACTED-STRING")` diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs index 8e6761d6d3..370d9e9a6f 100644 --- a/libs/postgres_ffi/build.rs +++ b/libs/postgres_ffi/build.rs @@ -126,6 +126,7 @@ fn main() -> anyhow::Result<()> { .allowlist_type("PageHeaderData") .allowlist_type("DBState") .allowlist_type("RelMapFile") + .allowlist_type("RepOriginId") // Because structs are used for serialization, tell bindgen to emit // explicit padding fields. .explicit_padding(true) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index d10ebfe277..729f57f829 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -3,7 +3,7 @@ #![allow(non_snake_case)] // bindgen creates some unsafe code with no doc comments. 
#![allow(clippy::missing_safety_doc)] -// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code. +// noted at 1.63 that in many cases there's u32 -> u32 transmutes in bindgen code. #![allow(clippy::useless_transmute)] // modules included with the postgres_ffi macro depend on the types of the specific version's // types, and trigger a too eager lint. @@ -110,6 +110,7 @@ pub mod pg_constants; pub mod relfile_utils; // Export some widely used datatypes that are unlikely to change across Postgres versions +pub use v14::bindings::RepOriginId; pub use v14::bindings::{uint32, uint64, Oid}; pub use v14::bindings::{BlockNumber, OffsetNumber}; pub use v14::bindings::{MultiXactId, TransactionId}; @@ -118,7 +119,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; // Likewise for these, although the assumption that these don't change is a little more iffy. pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; pub use v14::bindings::{PageHeaderData, XLogRecord}; -pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; +pub use v14::xlog_utils::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; pub use v14::bindings::{CheckPoint, ControlFileData}; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 3bc3c4914a..980f947746 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -82,6 +82,9 @@ pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40; pub const XLOG_XACT_ASSIGNMENT: u8 = 0x50; pub const XLOG_XACT_INVALIDATIONS: u8 = 0x60; +// From standbydefs.h +pub const XLOG_RUNNING_XACTS: u8 = 0x10; + // From srlu.h pub const SLRU_PAGES_PER_SEGMENT: u32 = 32; pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usize; @@ -101,7 +104,7 @@ pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1; pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2; pub const 
XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3; pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4; -// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5; +pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5; // pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6; // pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7; @@ -251,6 +254,10 @@ pub const XLOG_OVERWRITE_CONTRECORD: u8 = 0xD0; pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_LONG_HEADER: u16 = 0x0002; +/* From xlog.h */ +pub const XLOG_REPLORIGIN_SET: u8 = 0x00; +pub const XLOG_REPLORIGIN_DROP: u8 = 0x10; + /* From replication/slot.h */ pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */ + 64 /* NameData */ + 4*4; @@ -265,6 +272,9 @@ pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32; pub const VM_HEAPBLOCKS_PER_PAGE: u32 = (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) +/* From origin.c */ +pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; + // List of subdirectories inside pgdata. // Copied from src/bin/initdb/initdb.c pub const PGDATA_SUBDIRS: [&str; 22] = [ diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 0ca9bd8b45..0bbb91afc2 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -119,11 +119,6 @@ pub fn generate_pg_control( // Generate new pg_control needed for bootstrap checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; - //reset some fields we don't want to preserve - //TODO Check this. - //We may need to determine the value from twophase data. 
- checkpoint.oldestActiveXid = 0; - //save new values in pg_control pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; @@ -207,10 +202,16 @@ pub fn find_end_of_wal( let seg_offs = curr_lsn.segment_offset(wal_seg_size); segment.seek(SeekFrom::Start(seg_offs as u64))?; // loop inside segment - loop { + while curr_lsn.segment_number(wal_seg_size) == segno { let bytes_read = segment.read(&mut buf)?; if bytes_read == 0 { - break; // EOF + debug!( + "find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}", + result, + seg_file_path, + curr_lsn.segment_offset(wal_seg_size) + ); + return Ok(result); } curr_lsn += bytes_read as u64; decoder.feed_bytes(&buf[0..bytes_read]); @@ -329,8 +330,11 @@ impl CheckPoint { /// /// Returns 'true' if the XID was updated. pub fn update_next_xid(&mut self, xid: u32) -> bool { - // nextXid should nw greater than any XID in WAL, so increment provided XID and check for wraparround. - let mut new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID); + // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparound. + let mut new_xid = std::cmp::max( + xid.wrapping_add(1), + pg_constants::FIRST_NORMAL_TRANSACTION_ID, + ); // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL. // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE new_xid = @@ -366,8 +370,16 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD); + // xlp_rem_len doesn't include page header, hence the subtraction. 
+ ( + seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) } else { (0, 0) }; @@ -396,20 +408,22 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result 0 { + assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + ( + (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32, + pg_constants::XLP_FIRST_IS_CONTRECORD, + ) + } else { + (0, 0) + }; let header = XLogPageHeaderData { xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - pg_constants::XLP_FIRST_IS_CONTRECORD - } else { - 0 - }, + xlp_info, xlp_tli: PG_TLI, xlp_pageaddr: lsn.page_lsn().0, - xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 { - page_off as u32 - } else { - 0u32 - }, + xlp_rem_len, ..Default::default() // Put 0 in padding fields. }; let hdr_bytes = header.encode()?; @@ -425,11 +439,11 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result Result<()> { .init(); let arg_matches = cli().get_matches(); - let wal_craft = |arg_matches: &ArgMatches, client| { - let (intermediate_lsns, end_of_wal_lsn) = match arg_matches + let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| { + let intermediate_lsns = match arg_matches .get_one::("type") .map(|s| s.as_str()) .context("'type' is required")? 
@@ -25,6 +26,7 @@ fn main() -> Result<()> { LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?, a => panic!("Unknown --type argument: {a}"), }; + let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?; for lsn in intermediate_lsns { println!("intermediate_lsn = {lsn}"); } diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index 281a180e3b..6052f04d11 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -4,8 +4,9 @@ use log::*; use postgres::types::PgLsn; use postgres::Client; use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD}; -use std::cmp::Ordering; +use postgres_ffi::{ + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, +}; use std::path::{Path, PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; @@ -232,59 +233,62 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow pub trait Crafter { const NAME: &'static str; - /// Generates WAL using the client `client`. Returns a pair of: - /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from. - /// May include or exclude Lsn(0) and the end-of-wal. - /// * The expected end-of-wal LSN. - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)>; + /// Generates WAL using the client `client`. Returns a vector of some valid + /// "interesting" intermediate LSNs which one may start reading from. + /// test_end_of_wal uses this to check various starting points. + /// + /// Note that postgres is generally keen about writing some WAL. While we + /// try to disable it (autovacuum, big wal_writer_delay, etc) it is always + /// possible, e.g. xl_running_xacts are dumped each 15s. So checks about + /// stable WAL end would be flaky unless postgres is shut down. 
For this + /// reason returning potential end of WAL here is pointless. Most of the + /// time this doesn't happen though, so it is reasonable to create needed + /// WAL structure and immediately kill postgres like test_end_of_wal does. + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result>; } +/// Wraps some WAL craft function, providing current LSN to it before the +/// insertion and flushing WAL afterwards. Also pushes initial LSN to the +/// result. fn craft_internal( client: &mut C, - f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec, Option)>, -) -> anyhow::Result<(Vec, PgLsn)> { + f: impl Fn(&mut C, PgLsn) -> anyhow::Result>, +) -> anyhow::Result> { ensure_server_config(client)?; let initial_lsn = client.pg_current_wal_insert_lsn()?; info!("LSN initial = {}", initial_lsn); - let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?; - let last_lsn = match last_lsn { - None => client.pg_current_wal_insert_lsn()?, - Some(last_lsn) => { - let insert_lsn = client.pg_current_wal_insert_lsn()?; - match last_lsn.cmp(&insert_lsn) { - Ordering::Less => bail!( - "Some records were inserted after the crafted WAL: {} vs {}", - last_lsn, - insert_lsn - ), - Ordering::Equal => last_lsn, - Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"), - } - } - }; + let mut intermediate_lsns = f(client, initial_lsn)?; if !intermediate_lsns.starts_with(&[initial_lsn]) { intermediate_lsns.insert(0, initial_lsn); } - // Some records may be not flushed, e.g. non-transactional logical messages. - client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?; - match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) { - Ordering::Less => bail!("Some records were flushed after the crafted WAL"), - Ordering::Equal => {} - Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"), + // Some records may be not flushed, e.g. non-transactional logical messages. Flush now. 
+ // + // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn + // returns the position just after the page header on the next page. That's where the next + // record will be inserted. But the page header hasn't actually been written to the WAL + // yet, and if you try to flush it, you get a "request to flush past end of generated WAL" + // error. Because of that, if the insert location is just after a page header, back off to + // previous page boundary. + let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?); + if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64; + } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 { + lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64; } - Ok((intermediate_lsns, last_lsn)) + client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?; + Ok(intermediate_lsns) } pub struct Simple; impl Crafter for Simple { const NAME: &'static str = "simple"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { craft_internal(client, |client, _| { client.execute("CREATE table t(x int)", &[])?; - Ok((Vec::new(), None)) + Ok(Vec::new()) }) } } @@ -292,97 +296,114 @@ impl Crafter for Simple { pub struct LastWalRecordXlogSwitch; impl Crafter for LastWalRecordXlogSwitch { const NAME: &'static str = "last_wal_record_xlog_switch"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - // Do not use generate_internal because here we end up with flush_lsn exactly on + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + // Do not use craft_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. 
ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); + // pg_switch_wal returns end of last record of the switched segment, + // i.e. end of SWITCH itself. + let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + let before_xlog_switch_u64 = u64::from(before_xlog_switch); + let next_segment = PgLsn::from( + before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64) + + WAL_SEGMENT_SIZE as u64, + ); ensure!( - after_xlog_switch <= next_segment, - "XLOG_SWITCH message ended after the expected segment boundary: {} > {}", - after_xlog_switch, + xlog_switch_record_end <= next_segment, + "XLOG_SWITCH record ended after the expected segment boundary: {} > {}", + xlog_switch_record_end, next_segment ); - Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + Ok(vec![before_xlog_switch, xlog_switch_record_end]) } } pub struct LastWalRecordXlogSwitchEndsOnPageBoundary; +/// Craft xlog SWITCH record ending at page boundary. impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary { const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { // Do not use generate_internal because here we end up with flush_lsn exactly on // the segment boundary and insert_lsn after the initial page header, which is unusual. ensure_server_config(client)?; client.execute("CREATE table t(x int)", &[])?; - // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. - // We will use logical message as the padding. 
We start with detecting how much WAL - // it takes for one logical message, considering all alignments and headers. - let base_wal_advance = { + // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We + // will use carefully-sized logical messages to advance WAL insert location such + // that there is just enough space on the page for the XLOG_SWITCH record. + loop { + // We start with measuring how much WAL it takes for one logical message, + // considering all alignments and headers. let before_lsn = client.pg_current_wal_insert_lsn()?; - // Small non-empty message bigger than few bytes is more likely than an empty - // message to have the same format as the big padding message. client.execute( "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))", &[], )?; - // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD. - (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize - + XLOG_SIZE_OF_XLOG_RECORD - }; - let mut remaining_lsn = - XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) 
as usize % XLOG_BLCKSZ; - if remaining_lsn < base_wal_advance { - remaining_lsn += XLOG_BLCKSZ; - } - let repeats = 10 + remaining_lsn - base_wal_advance; - info!( - "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}", - client.pg_current_wal_insert_lsn()?, - remaining_lsn, - base_wal_advance, - repeats - ); - client.execute( - "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", - &[&(repeats as i32)], - )?; - info!( - "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", - client.pg_current_wal_insert_lsn()?, - XLOG_SIZE_OF_XLOG_RECORD - ); + let after_lsn = client.pg_current_wal_insert_lsn()?; - // Emit the XLOG_SWITCH - let before_xlog_switch = client.pg_current_wal_insert_lsn()?; - let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0); - let next_segment = PgLsn::from(0x0200_0000); - ensure!( - after_xlog_switch < next_segment, - "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}", - after_xlog_switch, - next_segment - ); - ensure!( - u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD, - "XLOG_SWITCH message ended not on page boundary: {}, offset = {}", - after_xlog_switch, - u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ - ); - Ok((vec![before_xlog_switch, after_xlog_switch], next_segment)) + // Did the record cross a page boundary? If it did, start over. Crossing a + // page boundary adds to the apparent size of the record because of the page + // header, which throws off the calculation. + if u64::from(before_lsn) / XLOG_BLCKSZ as u64 + != u64::from(after_lsn) / XLOG_BLCKSZ as u64 + { + continue; + } + // base_size is the size of a logical message without the payload + let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10; + + // Is there enough space on the page for another logical message and an + // XLOG_SWITCH? If not, start over. 
+ let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64; + if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 { + continue; + } + + // We will write another logical message, such that after the logical message + // record, there will be space for exactly one XLOG_SWITCH. How large should + // the logical message's payload be? An XLOG_SWITCH record has no data => its + // size is exactly XLOG_SIZE_OF_XLOG_RECORD. + let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64; + + client.execute( + "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))", + &[&(repeats as i32)], + )?; + info!( + "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}", + client.pg_current_wal_insert_lsn()?, + XLOG_SIZE_OF_XLOG_RECORD + ); + + // Emit the XLOG_SWITCH + let before_xlog_switch = client.pg_current_wal_insert_lsn()?; + let xlog_switch_record_end: PgLsn = + client.query_one("SELECT pg_switch_wal()", &[])?.get(0); + + if u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + != XLOG_SIZE_OF_XLOG_SHORT_PHD + { + warn!( + "XLOG_SWITCH message ended not on page boundary: {}, offset = {}, repeating", + xlog_switch_record_end, + u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ + ); + continue; + } + return Ok(vec![before_xlog_switch, xlog_switch_record_end]); + } } } -fn craft_single_logical_message( +/// Write ~16MB logical message; it should cross WAL segment. +fn craft_seg_size_logical_message( client: &mut impl postgres::GenericClient, transactional: bool, -) -> anyhow::Result<(Vec, PgLsn)> { +) -> anyhow::Result> { craft_internal(client, |client, initial_lsn| { ensure!( initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024), @@ -405,34 +426,24 @@ fn craft_single_logical_message( "Logical message crossed two segments" ); - if transactional { - // Transactional logical messages are part of a transaction, so the one above is - // followed by a small COMMIT record. 
- - let after_message_lsn = client.pg_current_wal_insert_lsn()?; - ensure!( - message_lsn < after_message_lsn, - "No record found after the emitted message" - ); - Ok((vec![message_lsn], Some(after_message_lsn))) - } else { - Ok((Vec::new(), Some(message_lsn))) - } + Ok(vec![message_lsn]) }) } pub struct WalRecordCrossingSegmentFollowedBySmallOne; impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne { const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - craft_single_logical_message(client, true) + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + // Transactional message crossing WAL segment will be followed by small + // commit record. + craft_seg_size_logical_message(client, true) } } pub struct LastWalRecordCrossingSegment; impl Crafter for LastWalRecordCrossingSegment { const NAME: &'static str = "last_wal_record_crossing_segment"; - fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec, PgLsn)> { - craft_single_logical_message(client, false) + fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result> { + craft_seg_size_logical_message(client, false) } } diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 6ff4c563b2..496458b2e4 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -11,13 +11,15 @@ use utils::const_assert; use utils::lsn::Lsn; fn init_logging() { - let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or( - format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"), - )) + let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!( + "crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace" + ))) .is_test(true) .try_init(); } +/// Test that 
find_end_of_wal returns the same results as pg_waldump on various +/// WALs created by Crafter. fn test_end_of_wal(test_name: &str) { use crate::*; @@ -38,13 +40,13 @@ fn test_end_of_wal(test_name: &str) { } cfg.initdb().unwrap(); let srv = cfg.start_server().unwrap(); - let (intermediate_lsns, expected_end_of_wal_partial) = - C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); + let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap(); let intermediate_lsns: Vec = intermediate_lsns .iter() .map(|&lsn| u64::from(lsn).into()) .collect(); - let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into(); + // Kill postgres. Note that it might have inserted to WAL something after + // 'craft' did its job. srv.kill(); // Check find_end_of_wal on the initial WAL @@ -56,7 +58,7 @@ fn test_end_of_wal(test_name: &str) { .filter(|fname| IsXLogFileName(fname)) .max() .unwrap(); - check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal); + let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment); for start_lsn in intermediate_lsns .iter() .chain(std::iter::once(&expected_end_of_wal)) @@ -91,11 +93,7 @@ fn test_end_of_wal(test_name: &str) { } } -fn check_pg_waldump_end_of_wal( - cfg: &crate::Conf, - last_segment: &str, - expected_end_of_wal: Lsn, -) { +fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn { // Get the actual end of WAL by pg_waldump let waldump_output = cfg .pg_waldump("000000010000000000000001", last_segment) @@ -113,11 +111,8 @@ fn check_pg_waldump_end_of_wal( } }; let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap(); - info!( - "waldump erred on {}, expected wal end at {}", - waldump_wal_end, expected_end_of_wal - ); - assert_eq!(waldump_wal_end, expected_end_of_wal); + info!("waldump erred on {}", waldump_wal_end); + waldump_wal_end } fn check_end_of_wal( @@ -210,9 +205,9 @@ pub fn test_update_next_xid() { #[test] pub fn 
test_encode_logical_message() { let expected = [ - 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, - 38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, - 101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38, + 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102, + 105, 120, 0, 109, 101, 115, 115, 97, 103, 101, ]; let actual = encode_logical_message("prefix", "message"); assert_eq!(expected, actual[..]); diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index b286eb0358..8afabe670e 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -7,11 +7,13 @@ license.workspace = true [dependencies] bytes.workspace = true byteorder.workspace = true +itertools.workspace = true pin-project-lite.workspace = true postgres-protocol.workspace = true rand.workspace = true tokio.workspace = true tracing.workspace = true thiserror.workspace = true +serde.workspace = true workspace_hack.workspace = true diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index c52a21bcd3..cee3742017 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -7,7 +7,9 @@ pub mod framed; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use std::{borrow::Cow, collections::HashMap, fmt, io, str}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use std::{borrow::Cow, fmt, io, str}; // re-export for use in utils pageserver_feedback.rs pub use postgres_protocol::PG_EPOCH; @@ -49,15 +51,37 @@ pub enum FeStartupPacket { }, } -#[derive(Debug)] +#[derive(Debug, Clone, Default)] +pub struct StartupMessageParamsBuilder { + params: BytesMut, +} + +impl StartupMessageParamsBuilder { + /// Set parameter's value by its name. 
+ /// name and value must not contain a \0 byte + pub fn insert(&mut self, name: &str, value: &str) { + self.params.put(name.as_bytes()); + self.params.put(&b"\0"[..]); + self.params.put(value.as_bytes()); + self.params.put(&b"\0"[..]); + } + + pub fn freeze(self) -> StartupMessageParams { + StartupMessageParams { + params: self.params.freeze(), + } + } +} + +#[derive(Debug, Clone, Default)] pub struct StartupMessageParams { - params: HashMap, + params: Bytes, } impl StartupMessageParams { /// Get parameter's value by its name. pub fn get(&self, name: &str) -> Option<&str> { - self.params.get(name).map(|s| s.as_str()) + self.iter().find_map(|(k, v)| (k == name).then_some(v)) } /// Split command-line options according to PostgreSQL's logic, @@ -111,19 +135,23 @@ impl StartupMessageParams { /// Iterate through key-value pairs in an arbitrary order. pub fn iter(&self) -> impl Iterator { - self.params.iter().map(|(k, v)| (k.as_str(), v.as_str())) + let params = + std::str::from_utf8(&self.params).expect("should be validated as utf8 already"); + params.split_terminator('\0').tuples() } // This function is mostly useful in tests. #[doc(hidden)] pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self { - Self { - params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(), + let mut b = StartupMessageParamsBuilder::default(); + for (k, v) in pairs { + b.insert(k, v) } + b.freeze() } } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub struct CancelKeyData { pub backend_pid: i32, pub cancel_key: i32, @@ -344,35 +372,21 @@ impl FeStartupPacket { (major_version, minor_version) => { // StartupMessage - // Parse pairs of null-terminated strings (key, value). - // See `postgres: ProcessStartupPacket, build_startup_packet`. - let mut tokens = str::from_utf8(&msg) - .map_err(|_e| { - ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) - })? 
- .strip_suffix('\0') // drop packet's own null - .ok_or_else(|| { - ProtocolError::Protocol( - "StartupMessage params: missing null terminator".to_string(), - ) - })? - .split_terminator('\0'); - - let mut params = HashMap::new(); - while let Some(name) = tokens.next() { - let value = tokens.next().ok_or_else(|| { - ProtocolError::Protocol( - "StartupMessage params: key without value".to_string(), - ) - })?; - - params.insert(name.to_owned(), value.to_owned()); - } + let s = str::from_utf8(&msg).map_err(|_e| { + ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned()) + })?; + let s = s.strip_suffix('\0').ok_or_else(|| { + ProtocolError::Protocol( + "StartupMessage params: missing null terminator".to_string(), + ) + })?; FeStartupPacket::StartupMessage { major_version, minor_version, - params: StartupMessageParams { params }, + params: StartupMessageParams { + params: msg.slice_ref(s.as_bytes()), + }, } } }; diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 2cc59a947b..78da01c9a0 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -15,11 +15,14 @@ aws-sdk-s3.workspace = true aws-credential-types.workspace = true bytes.workspace = true camino.workspace = true +humantime.workspace = true hyper = { workspace = true, features = ["stream"] } futures.workspace = true +rand.workspace = true serde.workspace = true serde_json.workspace = true tokio = { workspace = true, features = ["sync", "fs", "io-util"] } +tokio-stream.workspace = true tokio-util = { workspace = true, features = ["compat"] } toml_edit.workspace = true tracing.workspace = true @@ -35,6 +38,7 @@ azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true itertools.workspace = true +sync_wrapper = { workspace = true, features = ["futures"] } [dev-dependencies] camino-tempfile.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 
18cf5d97ba..dbd64fb5a6 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -3,9 +3,14 @@ use std::borrow::Cow; use std::collections::HashMap; use std::env; +use std::fmt::Display; +use std::io; use std::num::NonZeroU32; use std::pin::Pin; +use std::str::FromStr; use std::sync::Arc; +use std::time::Duration; +use std::time::SystemTime; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Result; @@ -13,18 +18,24 @@ use azure_core::request_options::{MaxResults, Metadata, Range}; use azure_core::RetryOptions; use azure_identity::DefaultAzureCredential; use azure_storage::StorageCredentials; +use azure_storage_blobs::blob::CopyStatus; use azure_storage_blobs::prelude::ClientBuilder; use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; use bytes::Bytes; +use futures::future::Either; use futures::stream::Stream; use futures_util::StreamExt; -use http_types::StatusCode; +use futures_util::TryStreamExt; +use http_types::{StatusCode, Url}; +use scopeguard::ScopeGuard; +use tokio_util::sync::CancellationToken; use tracing::debug; +use utils::backoff; -use crate::s3_bucket::RequestKind; +use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; use crate::{ - AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, - RemoteStorage, StorageMetadata, + error::Cancelled, AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, + ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { @@ -32,16 +43,21 @@ pub struct AzureBlobStorage { prefix_in_container: Option, max_keys_per_list_response: Option, concurrency_limiter: ConcurrencyLimiter, + // Per-request timeout. Accessible for tests. 
+ pub timeout: Duration, } impl AzureBlobStorage { - pub fn new(azure_config: &AzureConfig) -> Result { + pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result { debug!( "Creating azure remote storage for azure container {}", azure_config.container_name ); - let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT"); + // Use the storage account from the config by default, fall back to env var if not present. + let account = azure_config.storage_account.clone().unwrap_or_else(|| { + env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT") + }); // If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that, // otherwise try the token based credentials. @@ -72,6 +88,7 @@ impl AzureBlobStorage { prefix_in_container: azure_config.prefix_in_container.to_owned(), max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), + timeout, }) } @@ -114,21 +131,47 @@ impl AzureBlobStorage { async fn download_for_builder( &self, builder: GetBlobBuilder, + cancel: &CancellationToken, ) -> Result { - let mut response = builder.into_stream(); + let kind = RequestKind::Get; + + let _permit = self.permit(kind, cancel).await?; + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let mut etag = None; let mut last_modified = None; let mut metadata = HashMap::new(); - // TODO give proper streaming response instead of buffering into RAM - // https://github.com/neondatabase/neon/issues/5563 - let mut bufs = Vec::new(); - while let Some(part) = response.next().await { - let part = part.map_err(to_download_error)?; - let etag_str: &str = part.blob.properties.etag.as_ref(); + let started_at = start_measuring_requests(kind); + + let download = async { + let response = builder + // convert to concrete Pageable + .into_stream() + // convert to 
TryStream + .into_stream() + .map_err(to_download_error); + + // apply per request timeout + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + + // flatten + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), + }); + + let mut response = Box::pin(response); + + let Some(part) = response.next().await else { + return Err(DownloadError::Other(anyhow::anyhow!( + "Azure GET response contained no response body" + ))); + }; + let part = part?; if etag.is_none() { - etag = Some(etag.unwrap_or_else(|| etag_str.to_owned())); + etag = Some(part.blob.properties.etag); } if last_modified.is_none() { last_modified = Some(part.blob.properties.last_modified.into()); @@ -136,26 +179,64 @@ impl AzureBlobStorage { if let Some(blob_meta) = part.blob.metadata { metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned()))); } - let data = part + + // unwrap safety: if these were None, bufs would be empty and we would have returned an error already + let etag = etag.unwrap(); + let last_modified = last_modified.unwrap(); + + let tail_stream = response + .map(|part| match part { + Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))), + Err(e) => { + Either::Right(futures::stream::once(async { Err(io::Error::other(e)) })) + } + }) + .flatten(); + let stream = part .data - .collect() - .await - .map_err(|e| DownloadError::Other(e.into()))?; - bufs.push(data); - } - Ok(Download { - download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))), - etag, - last_modified, - metadata: Some(StorageMetadata(metadata)), - }) + .map(|r| r.map_err(io::Error::other)) + .chain(sync_wrapper::SyncStream::new(tail_stream)); + //.chain(SyncStream::from_pin(Box::pin(tail_stream))); + + let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream); + + Ok(Download { + download_stream: Box::pin(download_stream), + etag, + last_modified, + metadata: 
Some(StorageMetadata(metadata)), + }) + }; + + let download = tokio::select! { + bufs = download => bufs, + cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout { + TimeoutOrCancel::Timeout => return Err(DownloadError::Timeout), + TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled), + }, + }; + let started_at = ScopeGuard::into_inner(started_at); + let outcome = match &download { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + download } - async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { - self.concurrency_limiter - .acquire(kind) - .await - .expect("semaphore is never closed") + async fn permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result, Cancelled> { + let acquire = self.concurrency_limiter.acquire(kind); + + tokio::select! { + permit = acquire => Ok(permit.expect("never closed")), + _ = cancel.cancelled() => Err(Cancelled), + } } } @@ -179,59 +260,93 @@ fn to_download_error(error: azure_core::Error) -> DownloadError { } } -#[async_trait::async_trait] impl RemoteStorage for AzureBlobStorage { async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, ) -> anyhow::Result { - // get the passed prefix or if it is not set use prefix_in_bucket value - let list_prefix = prefix - .map(|p| self.relative_path_to_name(p)) - .or_else(|| self.prefix_in_container.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + let _permit = self.permit(RequestKind::List, cancel).await?; + + let op = async { + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = 
prefix + .map(|p| self.relative_path_to_name(p)) + .or_else(|| self.prefix_in_container.clone()) + .map(|mut p| { + // required to end with a separator + // otherwise request will return only the entry of a prefix + if matches!(mode, ListingMode::WithDelimiter) + && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) + { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + } + p + }); + + let mut builder = self.client.list_blobs(); + + if let ListingMode::WithDelimiter = mode { + builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + } + + if let Some(prefix) = list_prefix { + builder = builder.prefix(Cow::from(prefix.to_owned())); + } + + if let Some(limit) = self.max_keys_per_list_response { + builder = builder.max_results(MaxResults::new(limit)); + } + + let response = builder.into_stream(); + let response = response.into_stream().map_err(to_download_error); + let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = response.map(|res| match res { + Ok(res) => res, + Err(_elapsed) => Err(DownloadError::Timeout), }); - let mut builder = self.client.list_blobs(); + let mut response = std::pin::pin!(response); - if let ListingMode::WithDelimiter = mode { - builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + let mut res = Listing::default(); + + let mut max_keys = max_keys.map(|mk| mk.get()); + while let Some(entry) = response.next().await { + let entry = entry?; + let prefix_iter = entry + .blobs + .prefixes() + .map(|prefix| self.name_to_relative_path(&prefix.name)); + res.prefixes.extend(prefix_iter); + + let blob_iter = entry + .blobs + .blobs() + .map(|k| self.name_to_relative_path(&k.name)); + + for key in blob_iter { + res.keys.push(key); + + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + return Ok(res); // limit reached + } + max_keys = Some(mk); + } + } + } + + Ok(res) + }; + + tokio::select! 
{ + res = op => res, + _ = cancel.cancelled() => Err(DownloadError::Cancelled), } - - if let Some(prefix) = list_prefix { - builder = builder.prefix(Cow::from(prefix.to_owned())); - } - - if let Some(limit) = self.max_keys_per_list_response { - builder = builder.max_results(MaxResults::new(limit)); - } - - let mut response = builder.into_stream(); - let mut res = Listing::default(); - while let Some(l) = response.next().await { - let entry = l.map_err(to_download_error)?; - let prefix_iter = entry - .blobs - .prefixes() - .map(|prefix| self.name_to_relative_path(&prefix.name)); - res.prefixes.extend(prefix_iter); - - let blob_iter = entry - .blobs - .blobs() - .map(|k| self.name_to_relative_path(&k.name)); - res.keys.extend(blob_iter); - } - Ok(res) } async fn upload( @@ -240,35 +355,66 @@ impl RemoteStorage for AzureBlobStorage { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Put).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let kind = RequestKind::Put; + let _permit = self.permit(kind, cancel).await?; - let from: Pin> + Send + Sync + 'static>> = - Box::pin(from); + let started_at = start_measuring_requests(kind); - let from = NonSeekableStream::new(from, data_size_bytes); + let op = async { + let blob_client = self.client.blob_client(self.relative_path_to_name(to)); - let body = azure_core::Body::SeekableStream(Box::new(from)); + let from: Pin> + Send + Sync + 'static>> = + Box::pin(from); - let mut builder = blob_client.put_block_blob(body); + let from = NonSeekableStream::new(from, data_size_bytes); - if let Some(metadata) = metadata { - builder = builder.metadata(to_azure_metadata(metadata)); - } + let body = azure_core::Body::SeekableStream(Box::new(from)); - let _response = builder.into_future().await?; + let mut builder = blob_client.put_block_blob(body); - Ok(()) + if let Some(metadata) = metadata { + builder 
= builder.metadata(to_azure_metadata(metadata)); + } + + let fut = builder.into_future(); + let fut = tokio::time::timeout(self.timeout, fut); + + match fut.await { + Ok(Ok(_response)) => Ok(()), + Ok(Err(azure)) => Err(azure.into()), + Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), + } + }; + + let res = tokio::select! { + res = op => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let outcome = match res { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + }; + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, outcome, started_at); + + res } - async fn download(&self, from: &RemotePath) -> Result { - let _permit = self.permit(RequestKind::Get).await; + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let builder = blob_client.get(); - self.download_for_builder(builder).await + self.download_for_builder(builder, cancel).await } async fn download_byte_range( @@ -276,8 +422,8 @@ impl RemoteStorage for AzureBlobStorage { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { - let _permit = self.permit(RequestKind::Get).await; let blob_client = self.client.blob_client(self.relative_path_to_name(from)); let mut builder = blob_client.get(); @@ -289,44 +435,173 @@ impl RemoteStorage for AzureBlobStorage { }; builder = builder.range(range); - self.download_for_builder(builder).await + self.download_for_builder(builder, cancel).await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - let _permit = self.permit(RequestKind::Delete).await; - let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { + 
self.delete_objects(std::array::from_ref(path), cancel) + .await + } - let builder = blob_client.delete(); + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); - match builder.into_future().await { - Ok(_response) => Ok(()), - Err(e) => { - if let Some(http_err) = e.as_http_error() { - if http_err.status() == StatusCode::NotFound { - return Ok(()); + let op = async { + // TODO batch requests are not supported by the SDK + // https://github.com/Azure/azure-sdk-for-rust/issues/1068 + for path in paths { + #[derive(Debug)] + enum AzureOrTimeout { + AzureError(azure_core::Error), + Timeout, + Cancel, + } + impl Display for AzureOrTimeout { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") } } - Err(anyhow::Error::new(e)) + let warn_threshold = 3; + let max_retries = 5; + backoff::retry( + || async { + let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + + let request = blob_client.delete().into_future(); + + let res = tokio::time::timeout(self.timeout, request).await; + + match res { + Ok(Ok(_v)) => Ok(()), + Ok(Err(azure_err)) => { + if let Some(http_err) = azure_err.as_http_error() { + if http_err.status() == StatusCode::NotFound { + return Ok(()); + } + } + Err(AzureOrTimeout::AzureError(azure_err)) + } + Err(_elapsed) => Err(AzureOrTimeout::Timeout), + } + }, + |err| match err { + AzureOrTimeout::AzureError(_) | AzureOrTimeout::Timeout => false, + AzureOrTimeout::Cancel => true, + }, + warn_threshold, + max_retries, + "deleting remote object", + cancel, + ) + .await + .ok_or_else(|| AzureOrTimeout::Cancel) + .and_then(|x| x) + .map_err(|e| match e { + AzureOrTimeout::AzureError(err) => anyhow::Error::from(err), + AzureOrTimeout::Timeout => TimeoutOrCancel::Timeout.into(), + 
AzureOrTimeout::Cancel => TimeoutOrCancel::Cancel.into(), + })?; } - } + Ok(()) + }; + + let res = tokio::select! { + res = op => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { - // Permit is already obtained by inner delete function + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::Copy; + let _permit = self.permit(kind, cancel).await?; + let started_at = start_measuring_requests(kind); - // TODO batch requests are also not supported by the SDK - // https://github.com/Azure/azure-sdk-for-rust/issues/1068 - // https://github.com/Azure/azure-sdk-for-rust/issues/1249 - for path in paths { - self.delete(path).await?; - } - Ok(()) + let timeout = tokio::time::sleep(self.timeout); + + let mut copy_status = None; + + let op = async { + let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + + let source_url = format!( + "{}/{}", + self.client.url()?, + self.relative_path_to_name(from) + ); + + let builder = blob_client.copy(Url::from_str(&source_url)?); + let copy = builder.into_future(); + + let result = copy.await?; + + copy_status = Some(result.copy_status); + loop { + match copy_status.as_ref().expect("we always set it to Some") { + CopyStatus::Aborted => { + anyhow::bail!("Received abort for copy from {from} to {to}."); + } + CopyStatus::Failed => { + anyhow::bail!("Received failure response for copy from {from} to {to}."); + } + CopyStatus::Success => return Ok(()), + CopyStatus::Pending => (), + } + // The copy is taking longer. Waiting a second and then re-trying. 
+ // TODO estimate time based on copy_progress and adjust time based on that + tokio::time::sleep(Duration::from_millis(1000)).await; + let properties = blob_client.get_properties().into_future().await?; + let Some(status) = properties.blob.properties.copy_status else { + tracing::warn!("copy_status for copy is None!, from={from}, to={to}"); + return Ok(()); + }; + copy_status = Some(status); + } + }; + + let res = tokio::select! { + res = op => res, + _ = cancel.cancelled() => return Err(anyhow::Error::new(TimeoutOrCancel::Cancel)), + _ = timeout => { + let e = anyhow::Error::new(TimeoutOrCancel::Timeout); + let e = e.context(format!("Timeout, last status: {copy_status:?}")); + Err(e) + }, + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + res } - async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> { - Err(anyhow::anyhow!( - "copy for azure blob storage is not implemented" - )) + async fn time_travel_recover( + &self, + _prefix: Option<&RemotePath>, + _timestamp: SystemTime, + _done_if_after: SystemTime, + _cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { + // TODO use Azure point in time recovery feature for this + // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview + Err(TimeTravelError::Unimplemented) } } diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs new file mode 100644 index 0000000000..66422853e1 --- /dev/null +++ b/libs/remote_storage/src/error.rs @@ -0,0 +1,200 @@ +/// Reasons for downloads or listings to fail. +#[derive(Debug)] +pub enum DownloadError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The file was not found in the remote storage. + NotFound, + /// A cancellation token aborted the download, typically during + /// tenant detach or process shutdown. 
+ Cancelled, + /// A timeout happened while executing the request. Possible reasons: + /// - stuck tcp connection + /// + /// Concurrency control is not timed within timeout. + Timeout, + /// The file was found in the remote storage, but the download failed. + Other(anyhow::Error), +} + +impl std::fmt::Display for DownloadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DownloadError::BadInput(e) => { + write!(f, "Failed to download a remote file due to user input: {e}") + } + DownloadError::NotFound => write!(f, "No file found for the remote object id given"), + DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), + DownloadError::Timeout => write!(f, "timeout"), + DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), + } + } +} + +impl std::error::Error for DownloadError {} + +impl DownloadError { + /// Returns true if the error should not be retried with backoff + pub fn is_permanent(&self) -> bool { + use DownloadError::*; + match self { + BadInput(_) | NotFound | Cancelled => true, + Timeout | Other(_) => false, + } + } +} + +impl From for DownloadError { + fn from(value: std::io::Error) -> Self { + let needs_unwrap = value.kind() == std::io::ErrorKind::Other + && value + .get_ref() + .and_then(|x| x.downcast_ref::()) + .is_some(); + + if needs_unwrap { + *value + .into_inner() + .expect("just checked") + .downcast::() + .expect("just checked") + } else { + DownloadError::Other(value.into()) + } + } +} + +#[derive(Debug)] +pub enum TimeTravelError { + /// Validation or other error happened due to user input. + BadInput(anyhow::Error), + /// The used remote storage does not have time travel recovery implemented + Unimplemented, + /// The number of versions/deletion markers is above our limit. + TooManyVersions, + /// A cancellation token aborted the process, typically during + /// request closure or process shutdown. 
+ Cancelled, + /// Other errors + Other(anyhow::Error), +} + +impl std::fmt::Display for TimeTravelError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TimeTravelError::BadInput(e) => { + write!( + f, + "Failed to time travel recover a prefix due to user input: {e}" + ) + } + TimeTravelError::Unimplemented => write!( + f, + "time travel recovery is not implemented for the current storage backend" + ), + TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"), + TimeTravelError::TooManyVersions => { + write!(f, "Number of versions/delete markers above limit") + } + TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"), + } + } +} + +impl std::error::Error for TimeTravelError {} + +/// Plain cancelled error. +/// +/// By design this type does not not implement `std::error::Error` so it cannot be put as the root +/// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this +/// crate. +/// +/// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning +/// operations and ensuring that those get converted to proper versions with just `?`. +#[derive(Debug)] +pub(crate) struct Cancelled; + +impl From for anyhow::Error { + fn from(_: Cancelled) -> Self { + anyhow::Error::new(TimeoutOrCancel::Cancel) + } +} + +impl From for TimeTravelError { + fn from(_: Cancelled) -> Self { + TimeTravelError::Cancelled + } +} + +impl From for TimeoutOrCancel { + fn from(_: Cancelled) -> Self { + TimeoutOrCancel::Cancel + } +} + +impl From for DownloadError { + fn from(_: Cancelled) -> Self { + DownloadError::Cancelled + } +} + +/// This type is used at as the root cause for timeouts and cancellations with `anyhow::Error` returning +/// RemoteStorage methods. 
+/// +/// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is +/// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors. +#[derive(Debug)] +pub enum TimeoutOrCancel { + Timeout, + Cancel, +} + +impl std::fmt::Display for TimeoutOrCancel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use TimeoutOrCancel::*; + match self { + Timeout => write!(f, "timeout"), + Cancel => write!(f, "cancel"), + } + } +} + +impl std::error::Error for TimeoutOrCancel {} + +impl TimeoutOrCancel { + /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`]. + pub fn caused_by_cancel(error: &anyhow::Error) -> bool { + error + .root_cause() + .downcast_ref::() + .is_some_and(Self::is_cancel) + } + + pub fn is_cancel(&self) -> bool { + matches!(self, TimeoutOrCancel::Cancel) + } + + pub fn is_timeout(&self) -> bool { + matches!(self, TimeoutOrCancel::Timeout) + } +} + +/// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or +/// timeout to wrap it in an `std::io::Error`. 
+impl From for std::io::Error { + fn from(value: TimeoutOrCancel) -> Self { + let e = DownloadError::from(value); + std::io::Error::other(e) + } +} + +impl From for DownloadError { + fn from(value: TimeoutOrCancel) -> Self { + use TimeoutOrCancel::*; + + match value { + Timeout => DownloadError::Timeout, + Cancel => DownloadError::Cancelled, + } + } +} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 942d0016b0..72748e156c 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -10,21 +10,32 @@ #![deny(clippy::undocumented_unsafe_blocks)] mod azure_blob; +mod error; mod local_fs; +mod metrics; mod s3_bucket; mod simulate_failures; +mod support; use std::{ - collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime, + collections::HashMap, + fmt::Debug, + num::{NonZeroU32, NonZeroUsize}, + pin::Pin, + str::FromStr, + sync::Arc, + time::{Duration, SystemTime}, }; use anyhow::{bail, Context}; +use aws_sdk_s3::types::StorageClass; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; use futures::stream::Stream; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; use toml_edit::Item; use tracing::info; @@ -34,17 +45,22 @@ pub use self::{ }; use s3_bucket::RequestKind; +/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here. +pub use azure_core::Etag; + +pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel}; + /// Currently, sync happens with AWS S3, that has two limits on requests per second: /// ~200 RPS for IAM services /// /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; -/// We set this a little bit low as we currently buffer the entire file into RAM +/// Set this limit analogously to the S3 limit /// /// Here, a limit of max 20k concurrent connections was noted. 
/// -pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30; +pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; @@ -106,8 +122,8 @@ impl RemotePath { self.0.file_name() } - pub fn join(&self, segment: &Utf8Path) -> Self { - Self(self.0.join(segment)) + pub fn join(&self, path: impl AsRef) -> Self { + Self(self.0.join(path)) } pub fn get_path(&self) -> &Utf8PathBuf { @@ -121,6 +137,11 @@ impl RemotePath { pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> { self.0.strip_prefix(&p.0) } + + pub fn add_trailing_slash(&self) -> Self { + // Unwrap safety inputs are guararnteed to be valid UTF-8 + Self(format!("{}/", self.0).try_into().unwrap()) + } } /// We don't need callers to be able to pass arbitrary delimiters: just control @@ -142,45 +163,35 @@ pub struct Listing { /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. -#[async_trait::async_trait] +#[allow(async_fn_in_trait)] pub trait RemoteStorage: Send + Sync + 'static { - /// Lists all top level subdirectories for a given prefix - /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id - /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS) - /// so this method doesnt need to. - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - ) -> Result, DownloadError> { - let result = self - .list(prefix, ListingMode::WithDelimiter) - .await? 
- .prefixes; - Ok(result) - } - /// Lists all files in directory "recursively" - /// (not really recursively, because AWS has a flat namespace) - /// Note: This is subtely different than list_prefixes, - /// because it is for listing files instead of listing - /// names sharing common prefixes. - /// For example, - /// list_files("foo/bar") = ["foo/bar/cat123.txt", - /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"] - /// whereas, - /// list_prefixes("foo/bar/") = ["cat", "dog"] - /// See `test_real_s3.rs` for more details. - async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result> { - let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys; - Ok(result) - } - + /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2. + /// (see ``) + /// + /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not + /// from the absolute root of the bucket. + /// + /// `mode` configures whether to use a delimiter. Without a delimiter all keys + /// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of + /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are + /// returned in `keys` (). + /// + /// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function + /// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on + /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure. + /// async fn list( &self, prefix: Option<&RemotePath>, _mode: ListingMode, - ) -> anyhow::Result; + max_keys: Option, + cancel: &CancellationToken, + ) -> Result; /// Streams the local file contents into remote into the remote storage entry. 
+ /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will be + /// set to `TimeoutOrCancel`. async fn upload( &self, from: impl Stream> + Send + Sync + 'static, @@ -189,36 +200,85 @@ pub trait RemoteStorage: Send + Sync + 'static { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()>; - /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Streams the remote storage entry contents. + /// + /// The returned download stream will obey initial timeout and cancellation signal by erroring + /// on whichever happens first. Only one of the reasons will fail the stream, which is usually + /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out. + /// /// Returns the metadata, if any was stored with the file previously. - async fn download(&self, from: &RemotePath) -> Result; + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result; - /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer. + /// Streams a given byte range of the remote storage entry contents. + /// + /// The returned download stream will obey initial timeout and cancellation signal by erroring + /// on whichever happens first. Only one of the reasons will fail the stream, which is usually + /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out. + /// /// Returns the metadata, if any was stored with the file previously. async fn download_byte_range( &self, from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result; - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; + /// Delete a single path from remote storage. 
+ /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will be + /// set to `TimeoutOrCancel`. In such situation it is unknown if the deletion went through. + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>; - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; + /// Delete a multiple paths from remote storage. + /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will be + /// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went + /// through. + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()>; /// Copy a remote object inside a bucket from one path to another. - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>; + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()>; + + /// Resets the content of everything with the given prefix to the given state + async fn time_travel_recover( + &self, + prefix: Option<&RemotePath>, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError>; } -pub type DownloadStream = Pin> + Unpin + Send + Sync>>; +/// DownloadStream is sensitive to the timeout and cancellation used with the original +/// [`RemoteStorage::download`] request. The type yields `std::io::Result` to be compatible +/// with `tokio::io::copy_buf`. 
+// This has 'static because safekeepers do not use cancellation tokens (yet) +pub type DownloadStream = + Pin> + Send + Sync + 'static>>; + pub struct Download { pub download_stream: DownloadStream, /// The last time the file was modified (`last-modified` HTTP header) - pub last_modified: Option, + pub last_modified: SystemTime, /// A way to identify this specific version of the resource (`etag` HTTP header) - pub etag: Option, + pub etag: Etag, /// Extra key-value data, associated with the current remote file. pub metadata: Option, } @@ -231,106 +291,60 @@ impl Debug for Download { } } -#[derive(Debug)] -pub enum DownloadError { - /// Validation or other error happened due to user input. - BadInput(anyhow::Error), - /// The file was not found in the remote storage. - NotFound, - /// A cancellation token aborted the download, typically during - /// tenant detach or process shutdown. - Cancelled, - /// The file was found in the remote storage, but the download failed. - Other(anyhow::Error), -} - -impl std::fmt::Display for DownloadError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - DownloadError::BadInput(e) => { - write!(f, "Failed to download a remote file due to user input: {e}") - } - DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), - DownloadError::NotFound => write!(f, "No file found for the remote object id given"), - DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), - } - } -} - -impl std::error::Error for DownloadError {} - /// Every storage, currently supported. /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics. 
#[derive(Clone)] -pub enum GenericRemoteStorage { +// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925 +pub enum GenericRemoteStorage> { LocalFs(LocalFs), AwsS3(Arc), AzureBlob(Arc), - Unreliable(Arc), + Unreliable(Other), } -impl GenericRemoteStorage { +impl GenericRemoteStorage> { pub async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, ) -> anyhow::Result { match self { - Self::LocalFs(s) => s.list(prefix, mode).await, - Self::AwsS3(s) => s.list(prefix, mode).await, - Self::AzureBlob(s) => s.list(prefix, mode).await, - Self::Unreliable(s) => s.list(prefix, mode).await, - } - } - - // A function for listing all the files in a "directory" - // Example: - // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] - pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { - match self { - Self::LocalFs(s) => s.list_files(folder).await, - Self::AwsS3(s) => s.list_files(folder).await, - Self::AzureBlob(s) => s.list_files(folder).await, - Self::Unreliable(s) => s.list_files(folder).await, - } - } - - // lists common *prefixes*, if any of files - // Example: - // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"] - pub async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - ) -> Result, DownloadError> { - match self { - Self::LocalFs(s) => s.list_prefixes(prefix).await, - Self::AwsS3(s) => s.list_prefixes(prefix).await, - Self::AzureBlob(s) => s.list_prefixes(prefix).await, - Self::Unreliable(s) => s.list_prefixes(prefix).await, + Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await, + Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await, } } + /// See [`RemoteStorage::upload`] pub async fn upload( &self, from: impl Stream> + Send + Sync + 'static, 
data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await, - Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, + Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await, } } - pub async fn download(&self, from: &RemotePath) -> Result { + pub async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { match self { - Self::LocalFs(s) => s.download(from).await, - Self::AwsS3(s) => s.download(from).await, - Self::AzureBlob(s) => s.download(from).await, - Self::Unreliable(s) => s.download(from).await, + Self::LocalFs(s) => s.download(from, cancel).await, + Self::AwsS3(s) => s.download(from, cancel).await, + Self::AzureBlob(s) => s.download(from, cancel).await, + Self::Unreliable(s) => s.download(from, cancel).await, } } @@ -339,71 +353,126 @@ impl GenericRemoteStorage { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { match self { Self::LocalFs(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::AwsS3(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } Self::AzureBlob(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, 
cancel) .await } Self::Unreliable(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive) + s.download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } } } - pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + /// See [`RemoteStorage::delete`] + pub async fn delete( + &self, + path: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.delete(path).await, - Self::AwsS3(s) => s.delete(path).await, - Self::AzureBlob(s) => s.delete(path).await, - Self::Unreliable(s) => s.delete(path).await, + Self::LocalFs(s) => s.delete(path, cancel).await, + Self::AwsS3(s) => s.delete(path, cancel).await, + Self::AzureBlob(s) => s.delete(path, cancel).await, + Self::Unreliable(s) => s.delete(path, cancel).await, } } - pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + /// See [`RemoteStorage::delete_objects`] + pub async fn delete_objects( + &self, + paths: &[RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.delete_objects(paths).await, - Self::AwsS3(s) => s.delete_objects(paths).await, - Self::AzureBlob(s) => s.delete_objects(paths).await, - Self::Unreliable(s) => s.delete_objects(paths).await, + Self::LocalFs(s) => s.delete_objects(paths, cancel).await, + Self::AwsS3(s) => s.delete_objects(paths, cancel).await, + Self::AzureBlob(s) => s.delete_objects(paths, cancel).await, + Self::Unreliable(s) => s.delete_objects(paths, cancel).await, } } - pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + /// See [`RemoteStorage::copy`] + pub async fn copy_object( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { match self { - Self::LocalFs(s) => s.copy(from, to).await, - Self::AwsS3(s) => s.copy(from, to).await, - Self::AzureBlob(s) => s.copy(from, to).await, - Self::Unreliable(s) => s.copy(from, 
to).await, + Self::LocalFs(s) => s.copy(from, to, cancel).await, + Self::AwsS3(s) => s.copy(from, to, cancel).await, + Self::AzureBlob(s) => s.copy(from, to, cancel).await, + Self::Unreliable(s) => s.copy(from, to, cancel).await, + } + } + + /// See [`RemoteStorage::time_travel_recover`]. + pub async fn time_travel_recover( + &self, + prefix: Option<&RemotePath>, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { + match self { + Self::LocalFs(s) => { + s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await + } + Self::AwsS3(s) => { + s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await + } + Self::AzureBlob(s) => { + s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await + } + Self::Unreliable(s) => { + s.time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await + } } } } impl GenericRemoteStorage { pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { + let timeout = storage_config.timeout; Ok(match &storage_config.storage { - RemoteStorageKind::LocalFs(root) => { - info!("Using fs root '{root}' as a remote storage"); - Self::LocalFs(LocalFs::new(root.clone())?) + RemoteStorageKind::LocalFs(path) => { + info!("Using fs root '{path}' as a remote storage"); + Self::LocalFs(LocalFs::new(path.clone(), timeout)?) } RemoteStorageKind::AwsS3(s3_config) => { - info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'", + // The profile and access key id are only printed here for debugging purposes, + // their values don't indicate the eventually taken choice for auth. 
+ let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "".into()); + let access_key_id = + std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "".into()); + info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}", s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); - Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) + Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?)) } RemoteStorageKind::AzureContainer(azure_config) => { - info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'", + let storage_account = azure_config + .storage_account + .as_deref() + .unwrap_or(""); + info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); - Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?)) + Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?)) } }) } @@ -412,18 +481,15 @@ impl GenericRemoteStorage { Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first))) } - /// Takes storage object contents and its size and uploads to remote storage, - /// mapping `from_path` to the corresponding remote object id in the storage. - /// - /// The storage object does not have to be present on the `from_path`, - /// this path is used for the remote object id conversion only. + /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata. 
pub async fn upload_storage_object( &self, from: impl Stream> + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, + cancel: &CancellationToken, ) -> anyhow::Result<()> { - self.upload(from, from_size_bytes, to, None) + self.upload(from, from_size_bytes, to, None, cancel) .await .with_context(|| { format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}") @@ -436,10 +502,11 @@ impl GenericRemoteStorage { &self, byte_range: Option<(u64, Option)>, from: &RemotePath, + cancel: &CancellationToken, ) -> Result { match byte_range { - Some((start, end)) => self.download_byte_range(from, start, end).await, - None => self.download(from).await, + Some((start, end)) => self.download_byte_range(from, start, end, cancel).await, + None => self.download(from, cancel).await, } } } @@ -449,11 +516,24 @@ impl GenericRemoteStorage { #[derive(Debug, Clone, PartialEq, Eq)] pub struct StorageMetadata(HashMap); +impl From<[(&str, &str); N]> for StorageMetadata { + fn from(arr: [(&str, &str); N]) -> Self { + let map: HashMap = arr + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self(map) + } +} + /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { /// The storage connection configuration. pub storage: RemoteStorageKind, + /// A common timeout enforced for all requests after concurrency limiter permit has been + /// acquired. + pub timeout: Duration, } /// A kind of a remote storage to connect to, with its connection configuration. @@ -490,6 +570,7 @@ pub struct S3Config { /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, pub max_keys_per_list_response: Option, + pub upload_storage_class: Option, } impl Debug for S3Config { @@ -512,6 +593,8 @@ impl Debug for S3Config { pub struct AzureConfig { /// Name of the container to connect to. 
pub container_name: String, + /// Name of the storage account the container is inside of + pub storage_account: Option, /// The region where the bucket is located at. pub container_region: String, /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once. @@ -526,8 +609,9 @@ impl Debug for AzureConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("AzureConfig") .field("bucket_name", &self.container_name) + .field("storage_account", &self.storage_account) .field("bucket_region", &self.container_region) - .field("prefix_in_bucket", &self.prefix_in_container) + .field("prefix_in_container", &self.prefix_in_container) .field("concurrency_limit", &self.concurrency_limit) .field( "max_keys_per_list_response", @@ -538,6 +622,8 @@ impl Debug for AzureConfig { } impl RemoteStorageConfig { + pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { let local_path = toml.get("local_path"); let bucket_name = toml.get("bucket_name"); @@ -567,6 +653,27 @@ impl RemoteStorageConfig { .map(|endpoint| parse_toml_string("endpoint", endpoint)) .transpose()?; + let timeout = toml + .get("timeout") + .map(|timeout| { + timeout + .as_str() + .ok_or_else(|| anyhow::Error::msg("timeout was not a string")) + }) + .transpose() + .and_then(|timeout| { + timeout + .map(humantime::parse_duration) + .transpose() + .map_err(anyhow::Error::new) + }) + .context("parse timeout")? 
+ .unwrap_or(Self::DEFAULT_TIMEOUT); + + if timeout < Duration::from_secs(1) { + bail!("timeout was specified as {timeout:?} which is too low"); + } + let storage = match ( local_path, bucket_name, @@ -595,6 +702,18 @@ impl RemoteStorageConfig { endpoint, concurrency_limit, max_keys_per_list_response, + upload_storage_class: toml + .get("upload_storage_class") + .map(|prefix_in_bucket| -> anyhow::Result<_> { + let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values()); + } + Ok(storage_class) + }) + .transpose()?, }) } (_, _, _, Some(_), None) => { @@ -606,6 +725,12 @@ impl RemoteStorageConfig { (None, None, None, Some(container_name), Some(container_region)) => { RemoteStorageKind::AzureContainer(AzureConfig { container_name: parse_toml_string("container_name", container_name)?, + storage_account: toml + .get("storage_account") + .map(|storage_account| { + parse_toml_string("storage_account", storage_account) + }) + .transpose()?, container_region: parse_toml_string("container_region", container_region)?, prefix_in_container: toml .get("prefix_in_container") @@ -628,7 +753,7 @@ impl RemoteStorageConfig { } }; - Ok(Some(RemoteStorageConfig { storage })) + Ok(Some(RemoteStorageConfig { storage, timeout })) } } @@ -673,6 +798,7 @@ impl ConcurrencyLimiter { RequestKind::List => &self.read, RequestKind::Delete => &self.write, RequestKind::Copy => &self.write, + RequestKind::TimeTravel => &self.write, } } @@ -723,4 +849,24 @@ mod tests { let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths"); assert_eq!(err.to_string(), "Path \"/\" is not relative"); } + + #[test] + fn parse_localfs_config_with_timeout() { + let input = "local_path = '.' 
+timeout = '5s'"; + + let toml = input.parse::().unwrap(); + + let config = RemoteStorageConfig::from_toml(toml.as_item()) + .unwrap() + .expect("it exists"); + + assert_eq!( + config, + RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")), + timeout: Duration::from_secs(5) + } + ); + } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index bf8b6b5dde..1f7bcfc982 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -4,7 +4,12 @@ //! This storage used in tests, but can also be used in cases when a certain persistent //! volume is mounted to the local FS. -use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin}; +use std::{ + collections::HashSet, + io::ErrorKind, + num::NonZeroU32, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; use anyhow::{bail, ensure, Context}; use bytes::Bytes; @@ -14,25 +19,29 @@ use tokio::{ fs, io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}, }; -use tokio_util::io::ReaderStream; -use tracing::*; -use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty}; +use tokio_util::{io::ReaderStream, sync::CancellationToken}; +use utils::crashsafe::path_with_suffix_extension; -use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath}; +use crate::{ + Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel, + REMOTE_STORAGE_PREFIX_SEPARATOR, +}; use super::{RemoteStorage, StorageMetadata}; +use crate::Etag; const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp"; #[derive(Debug, Clone)] pub struct LocalFs { storage_root: Utf8PathBuf, + timeout: Duration, } impl LocalFs { /// Attempts to create local FS storage, along with its root directory. /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative). 
- pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result { + pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result { if !storage_root.exists() { std::fs::create_dir_all(&storage_root).with_context(|| { format!("Failed to create all directories in the given root path {storage_root:?}") @@ -44,7 +53,10 @@ impl LocalFs { })?; } - Ok(Self { storage_root }) + Ok(Self { + storage_root, + timeout, + }) } // mirrors S3Bucket::s3_object_to_relative_path @@ -79,7 +91,47 @@ impl LocalFs { #[cfg(test)] async fn list_all(&self) -> anyhow::Result> { - Ok(get_all_files(&self.storage_root, true) + use std::{future::Future, pin::Pin}; + fn get_all_files<'a, P>( + directory_path: P, + ) -> Pin>> + Send + Sync + 'a>> + where + P: AsRef + Send + Sync + 'a, + { + Box::pin(async move { + let directory_path = directory_path.as_ref(); + if directory_path.exists() { + if directory_path.is_dir() { + let mut paths = Vec::new(); + let mut dir_contents = fs::read_dir(directory_path).await?; + while let Some(dir_entry) = dir_contents.next_entry().await? { + let file_type = dir_entry.file_type().await?; + let entry_path = + Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { + anyhow::Error::msg(format!( + "non-Unicode path: {}", + pb.to_string_lossy() + )) + })?; + if file_type.is_symlink() { + tracing::debug!("{entry_path:?} is a symlink, skipping") + } else if file_type.is_dir() { + paths.extend(get_all_files(&entry_path).await?.into_iter()) + } else { + paths.push(entry_path); + } + } + Ok(paths) + } else { + bail!("Path {directory_path:?} is not a directory") + } + } else { + Ok(Vec::new()) + } + }) + } + + Ok(get_all_files(&self.storage_root) .await? .into_iter() .map(|path| { @@ -106,6 +158,14 @@ impl LocalFs { // S3 object list prefixes can be arbitrary strings, but when reading // the local filesystem we need a directory to start calling read_dir on. 
let mut initial_dir = full_path.clone(); + + // If there's no trailing slash, we have to start looking from one above: even if + // `initial_dir` is a directory, we should still list any prefixes in the parent + // that start with the same string. + if !full_path.to_string().ends_with('/') { + initial_dir.pop(); + } + loop { // Did we make it to the root? if initial_dir.parent().is_none() { @@ -155,77 +215,14 @@ impl LocalFs { Ok(files) } -} -#[async_trait::async_trait] -impl RemoteStorage for LocalFs { - async fn list( - &self, - prefix: Option<&RemotePath>, - mode: ListingMode, - ) -> Result { - let mut result = Listing::default(); - - if let ListingMode::NoDelimiter = mode { - let keys = self - .list_recursive(prefix) - .await - .map_err(DownloadError::Other)?; - - result.keys = keys - .into_iter() - .filter(|k| { - let path = k.with_base(&self.storage_root); - !path.is_dir() - }) - .collect(); - - return Ok(result); - } - - let path = match prefix { - Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)), - None => Cow::Borrowed(&self.storage_root), - }; - - let prefixes_to_filter = get_all_files(path.as_ref(), false) - .await - .map_err(DownloadError::Other)?; - - // filter out empty directories to mirror s3 behavior. - for prefix in prefixes_to_filter { - if prefix.is_dir() - && is_directory_empty(&prefix) - .await - .map_err(DownloadError::Other)? 
- { - continue; - } - - let stripped = prefix - .strip_prefix(&self.storage_root) - .context("Failed to strip prefix") - .and_then(RemotePath::new) - .expect( - "We list files for storage root, hence should be able to remote the prefix", - ); - - if prefix.is_dir() { - result.prefixes.push(stripped); - } else { - result.keys.push(stripped); - } - } - - Ok(result) - } - - async fn upload( + async fn upload0( &self, data: impl Stream> + Send + Sync, data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { let target_file_path = to.with_base(&self.storage_root); create_target_directory(&target_file_path).await?; @@ -247,6 +244,7 @@ impl RemoteStorage for LocalFs { fs::OpenOptions::new() .write(true) .create(true) + .truncate(true) .open(&temp_file_path) .await .with_context(|| { @@ -260,9 +258,26 @@ impl RemoteStorage for LocalFs { let mut buffer_to_read = data.take(from_size_bytes); // alternatively we could just write the bytes to a file, but local_fs is a testing utility - let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination) - .await - .with_context(|| { + let copy = io::copy_buf(&mut buffer_to_read, &mut destination); + + let bytes_read = tokio::select! { + biased; + _ = cancel.cancelled() => { + let file = destination.into_inner(); + // wait for the inflight operation(s) to complete so that there could be a next + // attempt right away and our writes are not directed to their file. + file.into_std().await; + + // TODO: leave the temp or not? leaving is probably less racy. enabled truncate at + // least. 
+ fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?; + return Err(TimeoutOrCancel::Cancel.into()); + } + read = copy => read, + }; + + let bytes_read = + bytes_read.with_context(|| { format!( "Failed to upload file (write temp) to the local storage at '{temp_file_path}'", ) @@ -294,6 +309,9 @@ impl RemoteStorage for LocalFs { })?; if let Some(storage_metadata) = metadata { + // FIXME: we must not be using metadata much, since this would forget the old metadata + // for new writes? or perhaps metadata is sticky; could consider removing if it's never + // used. let storage_metadata_path = storage_metadata_path(&target_file_path); fs::write( &storage_metadata_path, @@ -310,41 +328,176 @@ impl RemoteStorage for LocalFs { Ok(()) } +} - async fn download(&self, from: &RemotePath) -> Result { - let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? { - let source = ReaderStream::new( - fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?, - ); +impl RemoteStorage for LocalFs { + async fn list( + &self, + prefix: Option<&RemotePath>, + mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, + ) -> Result { + let op = async { + let mut result = Listing::default(); - let metadata = self - .read_storage_metadata(&target_path) + // Filter out directories: in S3 directories don't exist, only the keys within them do. 
+ let keys = self + .list_recursive(prefix) .await .map_err(DownloadError::Other)?; - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream: Box::pin(source), - }) - } else { - Err(DownloadError::NotFound) + let keys = keys + .into_iter() + .filter(|k| { + let path = k.with_base(&self.storage_root); + !path.is_dir() + }) + .collect(); + + if let ListingMode::NoDelimiter = mode { + result.keys = keys; + } else { + let mut prefixes = HashSet::new(); + for key in keys { + // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`. + let relative_key = if let Some(prefix) = prefix { + let mut prefix = prefix.clone(); + // We only strip the dirname of the prefix, so that when we strip it from the start of keys we + // end up with full file/dir names. + let prefix_full_local_path = prefix.with_base(&self.storage_root); + let has_slash = prefix.0.to_string().ends_with('/'); + let strip_prefix = if prefix_full_local_path.is_dir() && has_slash { + prefix + } else { + prefix.0.pop(); + prefix + }; + + RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap() + } else { + key + }; + + let relative_key = format!("{}", relative_key); + if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { + let first_part = relative_key + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .next() + .unwrap() + .to_owned(); + prefixes.insert(first_part); + } else { + result + .keys + .push(RemotePath::from_string(&relative_key).unwrap()); + } + } + result.prefixes = prefixes + .into_iter() + .map(|s| RemotePath::from_string(&s).unwrap()) + .collect(); + } + + if let Some(max_keys) = max_keys { + result.keys.truncate(max_keys.get() as usize); + } + Ok(result) + }; + + let timeout = async { + tokio::time::sleep(self.timeout).await; + Err(DownloadError::Timeout) + }; + + let cancelled = async { + cancel.cancelled().await; + Err(DownloadError::Cancelled) + }; + + tokio::select! 
{ + res = op => res, + res = timeout => res, + res = cancelled => res, } } + async fn upload( + &self, + data: impl Stream> + Send + Sync, + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let cancel = cancel.child_token(); + + let op = self.upload0(data, data_size_bytes, to, metadata, &cancel); + let mut op = std::pin::pin!(op); + + // race the upload0 to the timeout; if it goes over, do a graceful shutdown + let (res, timeout) = tokio::select! { + res = &mut op => (res, false), + _ = tokio::time::sleep(self.timeout) => { + cancel.cancel(); + (op.await, true) + } + }; + + match res { + Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => { + // we caused this cancel (or they happened simultaneously) -- swap it out to + // Timeout + Err(TimeoutOrCancel::Timeout.into()) + } + res => res, + } + } + + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + let target_path = from.with_base(&self.storage_root); + + let file_metadata = file_metadata(&target_path).await?; + + let source = ReaderStream::new( + fs::OpenOptions::new() + .read(true) + .open(&target_path) + .await + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") + }) + .map_err(DownloadError::Other)?, + ); + + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) + } + async fn download_byte_range( &self, from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + 
cancel: &CancellationToken, ) -> Result { if let Some(end_exclusive) = end_exclusive { if end_exclusive <= start_inclusive { @@ -354,44 +507,54 @@ impl RemoteStorage for LocalFs { return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); } } - let target_path = from.with_base(&self.storage_root); - if file_exists(&target_path).map_err(DownloadError::BadInput)? { - let mut source = tokio::fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?; - source - .seek(io::SeekFrom::Start(start_inclusive)) - .await - .context("Failed to seek to the range start in a local storage file") - .map_err(DownloadError::Other)?; - let metadata = self - .read_storage_metadata(&target_path) - .await - .map_err(DownloadError::Other)?; - let download_stream: DownloadStream = match end_exclusive { - Some(end_exclusive) => Box::pin(ReaderStream::new( - source.take(end_exclusive - start_inclusive), - )), - None => Box::pin(ReaderStream::new(source)), - }; - Ok(Download { - metadata, - last_modified: None, - etag: None, - download_stream, + let target_path = from.with_base(&self.storage_root); + let file_metadata = file_metadata(&target_path).await?; + let mut source = tokio::fs::OpenOptions::new() + .read(true) + .open(&target_path) + .await + .with_context(|| { + format!("Failed to open source file {target_path:?} to use in the download") }) - } else { - Err(DownloadError::NotFound) - } + .map_err(DownloadError::Other)?; + + let len = source + .metadata() + .await + .context("query file length") + .map_err(DownloadError::Other)? 
+ .len(); + + source + .seek(io::SeekFrom::Start(start_inclusive)) + .await + .context("Failed to seek to the range start in a local storage file") + .map_err(DownloadError::Other)?; + + let metadata = self + .read_storage_metadata(&target_path) + .await + .map_err(DownloadError::Other)?; + + let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); + let source = ReaderStream::new(source); + + let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); + let source = crate::support::DownloadStream::new(cancel_or_timeout, source); + + let etag = mock_etag(&file_metadata); + Ok(Download { + metadata, + last_modified: file_metadata + .modified() + .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, + etag, + download_stream: Box::pin(source), + }) } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> { let file_path = path.with_base(&self.storage_root); match fs::remove_file(&file_path).await { Ok(()) => Ok(()), @@ -403,14 +566,23 @@ impl RemoteStorage for LocalFs { } } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { for path in paths { - self.delete(path).await? + self.delete(path, cancel).await? 
} Ok(()) } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + _cancel: &CancellationToken, + ) -> anyhow::Result<()> { let from_path = from.with_base(&self.storage_root); let to_path = to.with_base(&self.storage_root); create_target_directory(&to_path).await?; @@ -423,56 +595,22 @@ impl RemoteStorage for LocalFs { })?; Ok(()) } + + async fn time_travel_recover( + &self, + _prefix: Option<&RemotePath>, + _timestamp: SystemTime, + _done_if_after: SystemTime, + _cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { + Err(TimeTravelError::Unimplemented) + } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { path_with_suffix_extension(original_path, "metadata") } -fn get_all_files<'a, P>( - directory_path: P, - recursive: bool, -) -> Pin>> + Send + Sync + 'a>> -where - P: AsRef + Send + Sync + 'a, -{ - Box::pin(async move { - let directory_path = directory_path.as_ref(); - if directory_path.exists() { - if directory_path.is_dir() { - let mut paths = Vec::new(); - let mut dir_contents = fs::read_dir(directory_path).await?; - while let Some(dir_entry) = dir_contents.next_entry().await? 
{ - let file_type = dir_entry.file_type().await?; - let entry_path = - Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| { - anyhow::Error::msg(format!( - "non-Unicode path: {}", - pb.to_string_lossy() - )) - })?; - if file_type.is_symlink() { - debug!("{entry_path:?} is a symlink, skipping") - } else if file_type.is_dir() { - if recursive { - paths.extend(get_all_files(&entry_path, true).await?.into_iter()) - } else { - paths.push(entry_path) - } - } else { - paths.push(entry_path); - } - } - Ok(paths) - } else { - bail!("Path {directory_path:?} is not a directory") - } - } else { - Ok(Vec::new()) - } - }) -} - async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> { let target_dir = match target_file_path.parent() { Some(parent_dir) => parent_dir, @@ -484,33 +622,39 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result< Ok(()) } -fn file_exists(file_path: &Utf8Path) -> anyhow::Result { - if file_path.exists() { - ensure!(file_path.is_file(), "file path '{file_path}' is not a file"); - Ok(true) - } else { - Ok(false) - } +async fn file_metadata(file_path: &Utf8Path) -> Result { + tokio::fs::metadata(&file_path).await.map_err(|e| { + if e.kind() == ErrorKind::NotFound { + DownloadError::NotFound + } else { + DownloadError::BadInput(e.into()) + } + }) +} + +// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we +// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests +// quickly, with less overhead than using a mock S3 server. 
+fn mock_etag(meta: &std::fs::Metadata) -> Etag { + let mtime = meta.modified().expect("Filesystem mtime missing"); + format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into() } #[cfg(test)] mod fs_tests { use super::*; - use bytes::Bytes; use camino_tempfile::tempdir; - use futures_util::Stream; use std::{collections::HashMap, io::Write}; - async fn read_and_assert_remote_file_contents( + async fn read_and_check_metadata( storage: &LocalFs, - #[allow(clippy::ptr_arg)] - // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements remote_storage_path: &RemotePath, expected_metadata: Option<&StorageMetadata>, ) -> anyhow::Result { + let cancel = CancellationToken::new(); let download = storage - .download(remote_storage_path) + .download(remote_storage_path, &cancel) .await .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?; ensure!( @@ -525,16 +669,16 @@ mod fs_tests { #[tokio::test] async fn upload_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; - let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?; + let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?; assert_eq!( storage.list_all().await?, vec![target_path_1.clone()], "Should list a single file after first upload" ); - let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?; + let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?; assert_eq!( list_files_sorted(&storage).await?, vec![target_path_1.clone(), target_path_2.clone()], @@ -546,7 +690,7 @@ mod fs_tests { #[tokio::test] async fn upload_file_negatives() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let id = RemotePath::new(Utf8Path::new("dummy"))?; let content = Bytes::from_static(b"12345"); @@ -555,36 +699,36 @@ mod fs_tests { // Check that you get an error if the size parameter doesn't match the 
actual // size of the stream. storage - .upload(content(), 0, &id, None) + .upload(content(), 0, &id, None, &cancel) .await .expect_err("upload with zero size succeeded"); storage - .upload(content(), 4, &id, None) + .upload(content(), 4, &id, None, &cancel) .await .expect_err("upload with too short size succeeded"); storage - .upload(content(), 6, &id, None) + .upload(content(), 6, &id, None, &cancel) .await .expect_err("upload with too large size succeeded"); // Correct size is 5, this should succeed. - storage.upload(content(), 5, &id, None).await?; + storage.upload(content(), 5, &id, None, &cancel).await?; Ok(()) } - fn create_storage() -> anyhow::Result { + fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> { let storage_root = tempdir()?.path().to_path_buf(); - LocalFs::new(storage_root) + LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new())) } #[tokio::test] async fn download_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; - let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + let contents = read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), contents, @@ -592,7 +736,7 @@ mod fs_tests { ); let non_existing_path = "somewhere/else"; - match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await { + match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } @@ -601,12 +745,12 @@ mod fs_tests { #[tokio::test] async fn 
download_file_range_positive() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let full_range_download_contents = - read_and_assert_remote_file_contents(&storage, &upload_target, None).await?; + read_and_check_metadata(&storage, &upload_target, None).await?; assert_eq!( dummy_contents(upload_name), full_range_download_contents, @@ -617,7 +761,12 @@ mod fs_tests { let (first_part_local, second_part_local) = uploaded_bytes.split_at(3); let first_part_download = storage - .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .download_byte_range( + &upload_target, + 0, + Some(first_part_local.len() as u64), + &cancel, + ) .await?; assert!( first_part_download.metadata.is_none(), @@ -635,6 +784,7 @@ mod fs_tests { &upload_target, first_part_local.len() as u64, Some((first_part_local.len() + second_part_local.len()) as u64), + &cancel, ) .await?; assert!( @@ -648,14 +798,30 @@ mod fs_tests { "Second part bytes should be returned when requested" ); + let suffix_bytes = storage + .download_byte_range(&upload_target, 13, None, &cancel) + .await? + .download_stream; + let suffix_bytes = aggregate(suffix_bytes).await?; + let suffix = std::str::from_utf8(&suffix_bytes)?; + assert_eq!(upload_name, suffix); + + let all_bytes = storage + .download_byte_range(&upload_target, 0, None, &cancel) + .await? 
+ .download_stream; + let all_bytes = aggregate(all_bytes).await?; + let all_bytes = std::str::from_utf8(&all_bytes)?; + assert_eq!(dummy_contents("upload_1"), all_bytes); + Ok(()) } #[tokio::test] async fn download_file_range_negative() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; let start = 1_000_000_000; let end = start + 1; @@ -664,6 +830,7 @@ mod fs_tests { &upload_target, start, Some(end), // exclusive end + &cancel, ) .await { @@ -680,7 +847,7 @@ mod fs_tests { let end = 234; assert!(start > end, "Should test an incorrect range"); match storage - .download_byte_range(&upload_target, start, Some(end)) + .download_byte_range(&upload_target, start, Some(end), &cancel) .await { Ok(_) => panic!("Should not allow downloading wrong ranges"), @@ -697,15 +864,15 @@ mod fs_tests { #[tokio::test] async fn delete_file() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; - storage.delete(&upload_target).await?; + storage.delete(&upload_target, &cancel).await?; assert!(storage.list_all().await?.is_empty()); storage - .delete(&upload_target) + .delete(&upload_target, &cancel) .await .expect("Should allow deleting non-existing storage files"); @@ -714,17 +881,17 @@ mod fs_tests { #[tokio::test] async fn file_with_metadata() -> anyhow::Result<()> { - let storage = create_storage()?; + let (storage, cancel) = create_storage()?; let upload_name = "upload_1"; let metadata = StorageMetadata(HashMap::from([ ("one".to_string(), "1".to_string()), ("two".to_string(), "2".to_string()), ])); let 
upload_target = - upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?; + upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?; let full_range_download_contents = - read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?; + read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?; assert_eq!( dummy_contents(upload_name), full_range_download_contents, @@ -735,7 +902,12 @@ mod fs_tests { let (first_part_local, _) = uploaded_bytes.split_at(3); let partial_download_with_metadata = storage - .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64)) + .download_byte_range( + &upload_target, + 0, + Some(first_part_local.len() as u64), + &cancel, + ) .await?; let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?; assert_eq!( @@ -756,16 +928,25 @@ mod fs_tests { #[tokio::test] async fn list() -> anyhow::Result<()> { // No delimiter: should recursively list everything - let storage = create_storage()?; - let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?; - let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?; + let (storage, cancel) = create_storage()?; + let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?; + let child_sibling = + upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?; + let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?; - let listing = storage.list(None, ListingMode::NoDelimiter).await?; + let listing = storage + .list(None, ListingMode::NoDelimiter, None, &cancel) + .await?; assert!(listing.prefixes.is_empty()); - assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec()); + assert_eq!( + listing.keys.into_iter().collect::>(), + HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()]) + ); // Delimiter: should only go one deep - let listing 
= storage.list(None, ListingMode::WithDelimiter).await?; + let listing = storage + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await?; assert_eq!( listing.prefixes, @@ -773,19 +954,157 @@ mod fs_tests { ); assert!(listing.keys.is_empty()); - // Delimiter & prefix + // Delimiter & prefix with a trailing slash + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!( + listing.keys, + [RemotePath::from_string("uncle").unwrap()].to_vec() + ); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("parent").unwrap()].to_vec() + ); + + // Delimiter and prefix without a trailing slash let listing = storage .list( Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()), ListingMode::WithDelimiter, + None, + &cancel, ) .await?; + assert_eq!(listing.keys, [].to_vec()); assert_eq!( listing.prefixes, - [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()] - .to_vec() + [RemotePath::from_string("grandparent").unwrap()].to_vec() ); - assert_eq!(listing.keys, [uncle.clone()].to_vec()); + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + assert_eq!( + listing.prefixes, + [RemotePath::from_string("grandparent").unwrap()].to_vec() + ); + + Ok(()) + } + + #[tokio::test] + async fn list_part_component() -> anyhow::Result<()> { + // No delimiter: should recursively list everything + let (storage, cancel) = create_storage()?; + + // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing + // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as + // a 
freeform prefix. + let _child_a = + upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?; + let _child_b = + upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?; + + // Delimiter and prefix that's partway through a path component + let listing = storage + .list( + Some( + &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(), + ), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await?; + assert_eq!(listing.keys, [].to_vec()); + + let mut found_prefixes = listing.prefixes.clone(); + found_prefixes.sort(); + assert_eq!( + found_prefixes, + [ + RemotePath::from_string("tenant").unwrap(), + RemotePath::from_string("tenant-01").unwrap(), + ] + .to_vec() + ); + + Ok(()) + } + + #[tokio::test] + async fn overwrite_shorter_file() -> anyhow::Result<()> { + let (storage, cancel) = create_storage()?; + + let path = RemotePath::new("does/not/matter/file".into())?; + + let body = Bytes::from_static(b"long file contents is long"); + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(body, read); + + let shorter = Bytes::from_static(b"shorter body"); + { + let len = shorter.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(shorter, read); + Ok(()) + } + + #[tokio::test] + async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> { + let (storage, cancel) = create_storage()?; + + let path = RemotePath::new("does/not/matter/file".into())?; + + let body = Bytes::from_static(b"long file contents is long"); + { + let len = body.len(); 
+ let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + let cancel = cancel.child_token(); + cancel.cancel(); + let e = storage + .upload(body, len, &path, None, &cancel) + .await + .unwrap_err(); + + assert!(TimeoutOrCancel::caused_by_cancel(&e)); + } + + { + let len = body.len(); + let body = + futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone()))); + storage.upload(body, len, &path, None, &cancel).await?; + } + + let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + assert_eq!(body, read); Ok(()) } @@ -794,6 +1113,7 @@ mod fs_tests { storage: &LocalFs, name: &str, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result { let from_path = storage .storage_root @@ -815,7 +1135,9 @@ mod fs_tests { let file = tokio_util::io::ReaderStream::new(file); - storage.upload(file, size, &relative_path, metadata).await?; + storage + .upload(file, size, &relative_path, metadata, cancel) + .await?; Ok(relative_path) } diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/metrics.rs similarity index 73% rename from libs/remote_storage/src/s3_bucket/metrics.rs rename to libs/remote_storage/src/metrics.rs index 21dde14906..bbb51590f3 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/metrics.rs @@ -12,8 +12,10 @@ pub(crate) enum RequestKind { Delete = 2, List = 3, Copy = 4, + TimeTravel = 5, } +use scopeguard::ScopeGuard; use RequestKind::*; impl RequestKind { @@ -24,6 +26,7 @@ impl RequestKind { Delete => "delete_object", List => "list_objects", Copy => "copy_object", + TimeTravel => "time_travel_recover", } } const fn as_index(&self) -> usize { @@ -31,17 +34,17 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 5]); +pub(crate) struct RequestTyped([C; 6]); impl RequestTyped { - pub(super) fn get(&self, kind: RequestKind) -> &C { + pub(crate) fn get(&self, kind: RequestKind) -> &C { 
&self.0[kind.as_index()] } fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { use RequestKind::*; - let mut it = [Get, Put, Delete, List, Copy].into_iter(); - let arr = std::array::from_fn::(|index| { + let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter(); + let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(next) @@ -56,19 +59,19 @@ impl RequestTyped { } impl RequestTyped { - pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { + pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { self.get(kind).observe(started_at.elapsed().as_secs_f64()) } } -pub(super) struct PassFailCancelledRequestTyped { +pub(crate) struct PassFailCancelledRequestTyped { success: RequestTyped, fail: RequestTyped, cancelled: RequestTyped, } #[derive(Debug, Clone, Copy)] -pub(super) enum AttemptOutcome { +pub(crate) enum AttemptOutcome { Ok, Err, Cancelled, @@ -84,7 +87,7 @@ impl From<&Result> for AttemptOutcome { } impl AttemptOutcome { - pub(super) fn as_str(&self) -> &'static str { + pub(crate) fn as_str(&self) -> &'static str { match self { AttemptOutcome::Ok => "ok", AttemptOutcome::Err => "err", @@ -94,7 +97,7 @@ impl AttemptOutcome { } impl PassFailCancelledRequestTyped { - pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { + pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { let target = match outcome { AttemptOutcome::Ok => &self.success, AttemptOutcome::Err => &self.fail, @@ -117,7 +120,7 @@ impl PassFailCancelledRequestTyped { } impl PassFailCancelledRequestTyped { - pub(super) fn observe_elapsed( + pub(crate) fn observe_elapsed( &self, kind: RequestKind, outcome: impl Into, @@ -128,19 +131,44 @@ impl PassFailCancelledRequestTyped { } } -pub(super) struct BucketMetrics { +/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`]. 
+pub(crate) fn start_counting_cancelled_wait( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |_| { + crate::metrics::BUCKET_METRICS + .cancelled_waits + .get(kind) + .inc() + }) +} + +/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`]. +pub(crate) fn start_measuring_requests( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Cancelled, + started_at, + ) + }) +} + +pub(crate) struct BucketMetrics { /// Full request duration until successful completion, error or cancellation. - pub(super) req_seconds: PassFailCancelledRequestTyped, + pub(crate) req_seconds: PassFailCancelledRequestTyped, /// Total amount of seconds waited on queue. - pub(super) wait_seconds: RequestTyped, + pub(crate) wait_seconds: RequestTyped, /// Track how many semaphore awaits were cancelled per request type. /// /// This is in case cancellations are happening more than expected. - pub(super) cancelled_waits: RequestTyped, + pub(crate) cancelled_waits: RequestTyped, /// Total amount of deleted objects in batches or single requests. 
- pub(super) deleted_objects_total: IntCounter, + pub(crate) deleted_objects_total: IntCounter, } impl Default for BucketMetrics { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index d7b41edaaf..76cf3eac80 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -6,12 +6,15 @@ use std::{ borrow::Cow, + collections::HashMap, + num::NonZeroU32, pin::Pin, sync::Arc, task::{Context, Poll}, + time::{Duration, SystemTime}, }; -use anyhow::Context as _; +use anyhow::{anyhow, Context as _}; use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, imds::credentials::ImdsCredentialsProvider, @@ -24,31 +27,35 @@ use aws_config::{ }; use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::{ - config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, + config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, - types::{Delete, ObjectIdentifier}, + types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; -use aws_smithy_types::body::SdkBody; -use aws_smithy_types::byte_stream::ByteStream; +use aws_smithy_types::{body::SdkBody, DateTime}; +use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; use hyper::Body; use scopeguard::ScopeGuard; +use tokio_util::sync::CancellationToken; +use utils::backoff; use super::StorageMetadata; use crate::{ + error::Cancelled, + metrics::{start_counting_cancelled_wait, start_measuring_requests}, + support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, - S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, + S3Config, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + REMOTE_STORAGE_PREFIX_SEPARATOR, }; -pub(super) 
mod metrics; - -use self::metrics::AttemptOutcome; -pub(super) use self::metrics::RequestKind; +use crate::metrics::AttemptOutcome; +pub(super) use crate::metrics::RequestKind; /// AWS S3 storage. pub struct S3Bucket { @@ -56,10 +63,12 @@ pub struct S3Bucket { bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, + upload_storage_class: Option, concurrency_limiter: ConcurrencyLimiter, + // Per-request timeout. Accessible for tests. + pub timeout: Duration, } -#[derive(Default)] struct GetObjectRequest { bucket: String, key: String, @@ -67,13 +76,13 @@ struct GetObjectRequest { } impl S3Bucket { /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided. - pub fn new(aws_config: &S3Config) -> anyhow::Result { + pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result { tracing::debug!( "Creating s3 remote storage for S3 bucket {}", - aws_config.bucket_name + remote_storage_config.bucket_name ); - let region = Some(Region::new(aws_config.bucket_region.clone())); + let region = Some(Region::new(remote_storage_config.bucket_region.clone())); let provider_conf = ProviderConfig::without_region().with_region(region.clone()); @@ -105,6 +114,38 @@ impl S3Bucket { // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) + .region(region) + .identity_cache(IdentityCache::lazy().build()) + .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + + let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| { + s.spawn(|| { + // TODO: make this function async. 
+ tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(sdk_config_loader.load()) + }) + .join() + .unwrap() + }); + + let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); + + // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions. + // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future) + if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() { + s3_config_builder = s3_config_builder + .endpoint_url(custom_endpoint) + .force_path_style(true); + } + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. @@ -112,41 +153,37 @@ impl S3Bucket { retry_config .set_max_attempts(Some(1)) .set_mode(Some(RetryMode::Adaptive)); + s3_config_builder = s3_config_builder.retry_config(retry_config.build()); - let mut config_builder = Builder::default() - .behavior_version(BehaviorVersion::v2023_11_09()) - .region(region) - .identity_cache(IdentityCache::lazy().build()) - .credentials_provider(SharedCredentialsProvider::new(credentials_provider)) - .retry_config(retry_config.build()) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)); + let s3_config = s3_config_builder.build(); + let client = aws_sdk_s3::Client::from_conf(s3_config); - if let Some(custom_endpoint) = aws_config.endpoint.clone() { - config_builder = config_builder - .endpoint_url(custom_endpoint) - .force_path_style(true); - } + let prefix_in_bucket = remote_storage_config + .prefix_in_bucket + .as_deref() + .map(|prefix| { + let mut prefix = prefix; + while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix = &prefix[1..] 
+ } - let client = Client::from_conf(config_builder.build()); + let mut prefix = prefix.to_string(); + while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix.pop(); + } + prefix + }); - let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| { - let mut prefix = prefix; - while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix = &prefix[1..] - } - - let mut prefix = prefix.to_string(); - while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { - prefix.pop(); - } - prefix - }); Ok(Self { client, - bucket_name: aws_config.bucket_name.clone(), - max_keys_per_list_response: aws_config.max_keys_per_list_response, + bucket_name: remote_storage_config.bucket_name.clone(), + max_keys_per_list_response: remote_storage_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + concurrency_limiter: ConcurrencyLimiter::new( + remote_storage_config.concurrency_limit.get(), + ), + upload_storage_class: remote_storage_config.upload_storage_class.clone(), + timeout, }) } @@ -170,50 +207,62 @@ impl S3Bucket { pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); - let path_string = path - .get_path() - .as_str() - .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path.get_path().as_str(); match &self.prefix_in_bucket { Some(prefix) => prefix.clone() + "/" + path_string, None => path_string.to_string(), } } - async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { + async fn permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result, Cancelled> { let started_at = start_counting_cancelled_wait(kind); - let permit = self - .concurrency_limiter - .acquire(kind) - .await - .expect("semaphore is never closed"); + let acquire = self.concurrency_limiter.acquire(kind); + + let permit = tokio::select! 
{ + permit = acquire => permit.expect("semaphore is never closed"), + _ = cancel.cancelled() => return Err(Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); - permit + Ok(permit) } - async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit { + async fn owned_permit( + &self, + kind: RequestKind, + cancel: &CancellationToken, + ) -> Result { let started_at = start_counting_cancelled_wait(kind); - let permit = self - .concurrency_limiter - .acquire_owned(kind) - .await - .expect("semaphore is never closed"); + let acquire = self.concurrency_limiter.acquire_owned(kind); + + let permit = tokio::select! { + permit = acquire => permit.expect("semaphore is never closed"), + _ = cancel.cancelled() => return Err(Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .wait_seconds .observe_elapsed(kind, started_at); - permit + Ok(permit) } - async fn download_object(&self, request: GetObjectRequest) -> Result { + async fn download_object( + &self, + request: GetObjectRequest, + cancel: &CancellationToken, + ) -> Result { let kind = RequestKind::Get; - let permit = self.owned_permit(kind).await; + + let permit = self.owned_permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); @@ -223,52 +272,138 @@ impl S3Bucket { .bucket(request.bucket) .key(request.key) .set_range(request.range) - .send() - .await; + .send(); + + let get_object = tokio::select! 
{ + res = get_object => res, + _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), + _ = cancel.cancelled() => return Err(DownloadError::Cancelled), + }; let started_at = ScopeGuard::into_inner(started_at); - match get_object { - Ok(object_output) => { - let metadata = object_output.metadata().cloned().map(StorageMetadata); - let etag = object_output.e_tag.clone(); - let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok()); - - let body = object_output.body; - let body = ByteStreamAsStream::from(body); - let body = PermitCarrying::new(permit, body); - let body = TimedDownload::new(started_at, body); - - Ok(Download { - metadata, - etag, - last_modified, - download_stream: Box::pin(body), - }) - } + let object_output = match get_object { + Ok(object_output) => object_output, Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { // Count this in the AttemptOutcome::Ok bucket, because 404 is not // an error: we expect to sometimes fetch an object and find it missing, // e.g. when probing for timeline indices. - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Ok, started_at, ); - Err(DownloadError::NotFound) + return Err(DownloadError::NotFound); } Err(e) => { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, AttemptOutcome::Err, started_at, ); - Err(DownloadError::Other( + return Err(DownloadError::Other( anyhow::Error::new(e).context("download s3 object"), - )) + )); + } + }; + + // even if we would have no timeout left, continue anyways. the caller can decide to ignore + // the errors considering timeouts and cancellation. 
+ let remaining = self.timeout.saturating_sub(started_at.elapsed()); + + let metadata = object_output.metadata().cloned().map(StorageMetadata); + let etag = object_output + .e_tag + .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))? + .into(); + let last_modified = object_output + .last_modified + .ok_or(DownloadError::Other(anyhow::anyhow!( + "Missing LastModified header" + )))? + .try_into() + .map_err(|e: ConversionError| DownloadError::Other(e.into()))?; + + let body = object_output.body; + let body = ByteStreamAsStream::from(body); + let body = PermitCarrying::new(permit, body); + let body = TimedDownload::new(started_at, body); + + let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone()); + let body = crate::support::DownloadStream::new(cancel_or_timeout, body); + + Ok(Download { + metadata, + etag, + last_modified, + download_stream: Box::pin(body), + }) + } + + async fn delete_oids( + &self, + _permit: &tokio::sync::SemaphorePermit<'_>, + delete_objects: &[ObjectIdentifier], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let mut cancel = std::pin::pin!(cancel.cancelled()); + + for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { + let started_at = start_measuring_requests(kind); + + let req = self + .client + .delete_objects() + .bucket(self.bucket_name.clone()) + .delete( + Delete::builder() + .set_objects(Some(chunk.to_vec())) + .build() + .context("build request")?, + ) + .send(); + + let resp = tokio::select! 
{ + resp = req => resp, + _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()), + _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()), + }; + + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &resp, started_at); + + let resp = resp.context("request deletion")?; + crate::metrics::BUCKET_METRICS + .deleted_objects_total + .inc_by(chunk.len() as u64); + + if let Some(errors) = resp.errors { + // Log a bounded number of the errors within the response: + // these requests can carry 1000 keys so logging each one + // would be too verbose, especially as errors may lead us + // to retry repeatedly. + const LOG_UP_TO_N_ERRORS: usize = 10; + for e in errors.iter().take(LOG_UP_TO_N_ERRORS) { + tracing::warn!( + "DeleteObjects key {} failed: {}: {}", + e.key.as_ref().map(Cow::from).unwrap_or("".into()), + e.code.as_ref().map(Cow::from).unwrap_or("".into()), + e.message.as_ref().map(Cow::from).unwrap_or("".into()) + ); + } + + return Err(anyhow::anyhow!( + "Failed to delete {}/{} objects", + errors.len(), + chunk.len(), + )); } } + Ok(()) } } @@ -297,45 +432,18 @@ impl Stream for ByteStreamAsStream { // sense and Stream::size_hint does not really } -pin_project_lite::pin_project! { - /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. - struct PermitCarrying { - permit: tokio::sync::OwnedSemaphorePermit, - #[pin] - inner: S, - } -} - -impl PermitCarrying { - fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { - Self { permit, inner } - } -} - -impl>> Stream for PermitCarrying { - type Item = ::Item; - - fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().inner.poll_next(cx) - } - - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} - pin_project_lite::pin_project! { /// Times and tracks the outcome of the request. 
struct TimedDownload { started_at: std::time::Instant, - outcome: metrics::AttemptOutcome, + outcome: AttemptOutcome, #[pin] inner: S } impl PinnedDrop for TimedDownload { fn drop(mut this: Pin<&mut Self>) { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); + crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); } } } @@ -344,7 +452,7 @@ impl TimedDownload { fn new(started_at: std::time::Instant, inner: S) -> Self { TimedDownload { started_at, - outcome: metrics::AttemptOutcome::Cancelled, + outcome: AttemptOutcome::Cancelled, inner, } } @@ -361,8 +469,8 @@ impl>> Stream for TimedDownload { let res = ready!(this.inner.poll_next(cx)); match &res { Some(Ok(_)) => {} - Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err, - None => *this.outcome = metrics::AttemptOutcome::Ok, + Some(Err(_)) => *this.outcome = AttemptOutcome::Err, + None => *this.outcome = AttemptOutcome::Ok, } Poll::Ready(res) @@ -373,58 +481,70 @@ impl>> Stream for TimedDownload { } } -#[async_trait::async_trait] impl RemoteStorage for S3Bucket { async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, ) -> Result { let kind = RequestKind::List; + // s3 sdk wants i32 + let mut max_keys = max_keys.map(|mk| mk.get() as i32); let mut result = Listing::default(); // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) - .or_else(|| self.prefix_in_bucket.clone()) - .map(|mut p| { - // required to end with a separator - // otherwise request will return only the entry of a prefix - if matches!(mode, ListingMode::WithDelimiter) - && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) - { - p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - } - p + .or_else(|| { + self.prefix_in_bucket.clone().map(|mut s| { + s.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + s + }) }); 
+ let _permit = self.permit(kind, cancel).await?; + let mut continuation_token = None; loop { - let _guard = self.permit(kind).await; let started_at = start_measuring_requests(kind); + // min of two Options, returning Some if one is value and another is + // None (None is smaller than anything, so plain min doesn't work). + let request_max_keys = self + .max_keys_per_list_response + .into_iter() + .chain(max_keys.into_iter()) + .min(); let mut request = self .client .list_objects_v2() .bucket(self.bucket_name.clone()) .set_prefix(list_prefix.clone()) .set_continuation_token(continuation_token) - .set_max_keys(self.max_keys_per_list_response); + .set_max_keys(request_max_keys); if let ListingMode::WithDelimiter = mode { request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); } - let response = request - .send() - .await + let request = request.send(); + + let response = tokio::select! { + res = request => res, + _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout), + _ = cancel.cancelled() => return Err(DownloadError::Cancelled), + }; + + let response = response .context("Failed to list S3 prefixes") .map_err(DownloadError::Other); let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &response, started_at); @@ -440,13 +560,25 @@ impl RemoteStorage for S3Bucket { let object_path = object.key().expect("response does not contain a key"); let remote_path = self.s3_object_to_relative_path(object_path); result.keys.push(remote_path); + if let Some(mut mk) = max_keys { + assert!(mk > 0); + mk -= 1; + if mk == 0 { + return Ok(result); // limit reached + } + max_keys = Some(mk); + } } - result.prefixes.extend( - prefixes - .iter() - .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), - ); + // S3 gives us prefixes like "foo/", we return them like "foo" + result.prefixes.extend(prefixes.iter().filter_map(|o| { + Some( + 
self.s3_object_to_relative_path( + o.prefix()? + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR), + ), + ) + })); continuation_token = match response.next_continuation_token { Some(new_token) => Some(new_token), @@ -463,39 +595,59 @@ impl RemoteStorage for S3Bucket { from_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { let kind = RequestKind::Put; - let _guard = self.permit(kind).await; + let _permit = self.permit(kind, cancel).await?; let started_at = start_measuring_requests(kind); let body = Body::wrap_stream(from); let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); - let res = self + let upload = self .client .put_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) + .set_storage_class(self.upload_storage_class.clone()) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) - .send() - .await; + .send(); - let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &res, started_at); + let upload = tokio::time::timeout(self.timeout, upload); - res?; + let res = tokio::select! { + res = upload => res, + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; - Ok(()) + if let Ok(inner) = &res { + // do not incl. 
timeouts as errors in metrics but cancellations + let started_at = ScopeGuard::into_inner(started_at); + crate::metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, inner, started_at); + } + + match res { + Ok(Ok(_put)) => Ok(()), + Ok(Err(sdk)) => Err(sdk.into()), + Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), + } } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { let kind = RequestKind::Copy; - let _guard = self.permit(kind).await; + let _permit = self.permit(kind, cancel).await?; + + let timeout = tokio::time::sleep(self.timeout); let started_at = start_measuring_requests(kind); @@ -506,17 +658,23 @@ impl RemoteStorage for S3Bucket { self.relative_path_to_s3_object(from) ); - let res = self + let op = self .client .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(copy_source) - .send() - .await; + .send(); + + let res = tokio::select! 
{ + res = op => res, + _ = timeout => return Err(TimeoutOrCancel::Timeout.into()), + _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()), + }; let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS + crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, &res, started_at); @@ -525,14 +683,21 @@ impl RemoteStorage for S3Bucket { Ok(()) } - async fn download(&self, from: &RemotePath) -> Result { + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { // if prefix is not none then download file `prefix/from` // if prefix is none then download file `from` - self.download_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: self.relative_path_to_s3_object(from), - range: None, - }) + self.download_object( + GetObjectRequest { + bucket: self.bucket_name.clone(), + key: self.relative_path_to_s3_object(from), + range: None, + }, + cancel, + ) .await } @@ -541,6 +706,7 @@ impl RemoteStorage for S3Bucket { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 // and needs both ends to be exclusive @@ -550,105 +716,322 @@ impl RemoteStorage for S3Bucket { None => format!("bytes={start_inclusive}-"), }); - self.download_object(GetObjectRequest { - bucket: self.bucket_name.clone(), - key: self.relative_path_to_s3_object(from), - range, - }) + self.download_object( + GetObjectRequest { + bucket: self.bucket_name.clone(), + key: self.relative_path_to_s3_object(from), + range, + }, + cancel, + ) .await } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { - let kind = RequestKind::Delete; - let _guard = self.permit(kind).await; + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let kind = RequestKind::Delete; + let permit = 
self.permit(kind, cancel).await?; let mut delete_objects = Vec::with_capacity(paths.len()); for path in paths { let obj_id = ObjectIdentifier::builder() .set_key(Some(self.relative_path_to_s3_object(path))) - .build()?; + .build() + .context("convert path to oid")?; delete_objects.push(obj_id); } - for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) { - let started_at = start_measuring_requests(kind); + self.delete_oids(&permit, &delete_objects, cancel).await + } - let resp = self - .client - .delete_objects() - .bucket(self.bucket_name.clone()) - .delete( - Delete::builder() - .set_objects(Some(chunk.to_vec())) - .build()?, - ) - .send() - .await; + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { + let paths = std::array::from_ref(path); + self.delete_objects(paths, cancel).await + } - let started_at = ScopeGuard::into_inner(started_at); - metrics::BUCKET_METRICS - .req_seconds - .observe_elapsed(kind, &resp, started_at); + async fn time_travel_recover( + &self, + prefix: Option<&RemotePath>, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { + let kind = RequestKind::TimeTravel; + let permit = self.permit(kind, cancel).await?; - match resp { - Ok(resp) => { - metrics::BUCKET_METRICS - .deleted_objects_total - .inc_by(chunk.len() as u64); - if let Some(errors) = resp.errors { - // Log a bounded number of the errors within the response: - // these requests can carry 1000 keys so logging each one - // would be too verbose, especially as errors may lead us - // to retry repeatedly. 
- const LOG_UP_TO_N_ERRORS: usize = 10; - for e in errors.iter().take(LOG_UP_TO_N_ERRORS) { - tracing::warn!( - "DeleteObjects key {} failed: {}: {}", - e.key.as_ref().map(Cow::from).unwrap_or("".into()), - e.code.as_ref().map(Cow::from).unwrap_or("".into()), - e.message.as_ref().map(Cow::from).unwrap_or("".into()) - ); - } + let timestamp = DateTime::from(timestamp); + let done_if_after = DateTime::from(done_if_after); - return Err(anyhow::format_err!( - "Failed to delete {} objects", - errors.len() - )); + tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}"); + + // get the passed prefix or if it is not set use prefix_in_bucket value + let prefix = prefix + .map(|p| self.relative_path_to_s3_object(p)) + .or_else(|| self.prefix_in_bucket.clone()); + + let warn_threshold = 3; + let max_retries = 10; + let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled); + + let mut key_marker = None; + let mut version_id_marker = None; + let mut versions_and_deletes = Vec::new(); + + loop { + let response = backoff::retry( + || async { + let op = self + .client + .list_object_versions() + .bucket(self.bucket_name.clone()) + .set_prefix(prefix.clone()) + .set_key_marker(key_marker.clone()) + .set_version_id_marker(version_id_marker.clone()) + .send(); + + tokio::select! 
{ + res = op => res.map_err(|e| TimeTravelError::Other(e.into())), + _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), + } + }, + is_permanent, + warn_threshold, + max_retries, + "listing object versions for time_travel_recover", + cancel, + ) + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; + + tracing::trace!( + " Got List response version_id_marker={:?}, key_marker={:?}", + response.version_id_marker, + response.key_marker + ); + let versions = response + .versions + .unwrap_or_default() + .into_iter() + .map(VerOrDelete::from_version); + let deletes = response + .delete_markers + .unwrap_or_default() + .into_iter() + .map(VerOrDelete::from_delete_marker); + itertools::process_results(versions.chain(deletes), |n_vds| { + versions_and_deletes.extend(n_vds) + }) + .map_err(TimeTravelError::Other)?; + fn none_if_empty(v: Option) -> Option { + v.filter(|v| !v.is_empty()) + } + version_id_marker = none_if_empty(response.next_version_id_marker); + key_marker = none_if_empty(response.next_key_marker); + if version_id_marker.is_none() { + // The final response is not supposed to be truncated + if response.is_truncated.unwrap_or_default() { + return Err(TimeTravelError::Other(anyhow::anyhow!( + "Received truncated ListObjectVersions response for prefix={prefix:?}" + ))); + } + break; + } + // Limit the number of versions deletions, mostly so that we don't + // keep requesting forever if the list is too long, as we'd put the + // list in RAM. + // Building a list of 100k entries that reaches the limit roughly takes + // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size. 
+ const COMPLEXITY_LIMIT: usize = 100_000; + if versions_and_deletes.len() >= COMPLEXITY_LIMIT { + return Err(TimeTravelError::TooManyVersions); + } + } + + tracing::info!( + "Built list for time travel with {} versions and deletions", + versions_and_deletes.len() + ); + + // Work on the list of references instead of the objects directly, + // otherwise we get lifetime errors in the sort_by_key call below. + let mut versions_and_deletes = versions_and_deletes.iter().collect::>(); + + versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified)); + + let mut vds_for_key = HashMap::<_, Vec<_>>::new(); + + for vd in &versions_and_deletes { + let VerOrDelete { + version_id, key, .. + } = &vd; + if version_id == "null" { + return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \ + indicating either disabled versioning, or legacy objects with null version id values"))); + } + tracing::trace!( + "Parsing version key={key} version_id={version_id} kind={:?}", + vd.kind + ); + + vds_for_key.entry(key).or_default().push(vd); + } + for (key, versions) in vds_for_key { + let last_vd = versions.last().unwrap(); + if last_vd.last_modified > done_if_after { + tracing::trace!("Key {key} has version later than done_if_after, skipping"); + continue; + } + // the version we want to restore to. + let version_to_restore_to = + match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) { + Ok(v) => v, + Err(e) => e, + }; + if version_to_restore_to == versions.len() { + tracing::trace!("Key {key} has no changes since timestamp, skipping"); + continue; + } + let mut do_delete = false; + if version_to_restore_to == 0 { + // All versions more recent, so the key didn't exist at the specified time point. 
+ tracing::trace!( + "All {} versions more recent for {key}, deleting", + versions.len() + ); + do_delete = true; + } else { + match &versions[version_to_restore_to - 1] { + VerOrDelete { + kind: VerOrDeleteKind::Version, + version_id, + .. + } => { + tracing::trace!("Copying old version {version_id} for {key}..."); + // Restore the state to the last version by copying + let source_id = + format!("{}/{key}?versionId={version_id}", self.bucket_name); + + backoff::retry( + || async { + let op = self + .client + .copy_object() + .bucket(self.bucket_name.clone()) + .key(key) + .set_storage_class(self.upload_storage_class.clone()) + .copy_source(&source_id) + .send(); + + tokio::select! { + res = op => res.map_err(|e| TimeTravelError::Other(e.into())), + _ = cancel.cancelled() => Err(TimeTravelError::Cancelled), + } + }, + is_permanent, + warn_threshold, + max_retries, + "copying object version for time_travel_recover", + cancel, + ) + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; + tracing::info!(%version_id, %key, "Copied old version in S3"); + } + VerOrDelete { + kind: VerOrDeleteKind::DeleteMarker, + .. 
+ } => { + do_delete = true; } } - Err(e) => { - return Err(e.into()); + }; + if do_delete { + if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) { + // Key has since been deleted (but there was some history), no need to do anything + tracing::trace!("Key {key} already deleted, skipping."); + } else { + tracing::trace!("Deleting {key}..."); + + let oid = ObjectIdentifier::builder() + .key(key.to_owned()) + .build() + .map_err(|e| TimeTravelError::Other(e.into()))?; + + self.delete_oids(&permit, &[oid], cancel) + .await + .map_err(|e| { + // delete_oid0 will use TimeoutOrCancel + if TimeoutOrCancel::caused_by_cancel(&e) { + TimeTravelError::Cancelled + } else { + TimeTravelError::Other(e) + } + })?; } } } Ok(()) } - - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - let paths = std::array::from_ref(path); - self.delete_objects(paths).await - } } -/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. -fn start_counting_cancelled_wait( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |_| { - metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc() - }) +// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry +struct VerOrDelete { + kind: VerOrDeleteKind, + last_modified: DateTime, + version_id: String, + key: String, } -/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`]. 
-fn start_measuring_requests( - kind: RequestKind, -) -> ScopeGuard { - scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { - metrics::BUCKET_METRICS.req_seconds.observe_elapsed( +#[derive(Debug)] +enum VerOrDeleteKind { + Version, + DeleteMarker, +} + +impl VerOrDelete { + fn with_kind( + kind: VerOrDeleteKind, + last_modified: Option, + version_id: Option, + key: Option, + ) -> anyhow::Result { + let lvk = (last_modified, version_id, key); + let (Some(last_modified), Some(version_id), Some(key)) = lvk else { + anyhow::bail!( + "One (or more) of last_modified, key, and id is None. \ + Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}", + lvk.0, + lvk.1, + lvk.2, + ); + }; + Ok(Self { kind, - AttemptOutcome::Cancelled, - started_at, + last_modified, + version_id, + key, + }) + } + fn from_version(v: ObjectVersion) -> anyhow::Result { + Self::with_kind( + VerOrDeleteKind::Version, + v.last_modified, + v.version_id, + v.key, ) - }) + } + fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result { + Self::with_kind( + VerOrDeleteKind::DeleteMarker, + v.last_modified, + v.version_id, + v.key, + ) + } } #[cfg(test)] @@ -672,23 +1055,23 @@ mod tests { Some("test/prefix/"), Some("/test/prefix/"), ]; - let expected_outputs = vec![ - vec!["", "some/path", "some/path"], - vec!["/", "/some/path", "/some/path"], + let expected_outputs = [ + vec!["", "some/path", "some/path/"], + vec!["/", "/some/path", "/some/path/"], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], vec![ "test/prefix/", "test/prefix/some/path", - "test/prefix/some/path", + "test/prefix/some/path/", ], ]; @@ -700,8 +1083,10 @@ mod tests { endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), + upload_storage_class: None, }; - let storage = 
S3Bucket::new(&config).expect("remote storage init"); + let storage = + S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); for (test_path_idx, test_path) in all_paths.iter().enumerate() { let result = storage.relative_path_to_s3_object(test_path); let expected = expected_outputs[prefix_idx][test_path_idx]; diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 7f5adcea30..c467a2d196 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -3,16 +3,20 @@ //! testing purposes. use bytes::Bytes; use futures::stream::Stream; -use std::collections::hash_map::Entry; use std::collections::HashMap; +use std::num::NonZeroU32; use std::sync::Mutex; +use std::time::SystemTime; +use std::{collections::hash_map::Entry, sync::Arc}; +use tokio_util::sync::CancellationToken; use crate::{ - Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata, + Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, + StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { - inner: crate::GenericRemoteStorage, + inner: GenericRemoteStorage>, // This many attempts of each operation will fail, then we let it succeed. 
attempts_to_fail: u64, @@ -29,11 +33,21 @@ enum RemoteOp { Download(RemotePath), Delete(RemotePath), DeleteObjects(Vec), + TimeTravelRecover(Option), } impl UnreliableWrapper { pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self { assert!(attempts_to_fail > 0); + let inner = match inner { + GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s), + GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s), + GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s), + // We could also make this a no-op, as in, extract the inner of the passed generic remote storage + GenericRemoteStorage::Unreliable(_s) => { + panic!("Can't wrap unreliable wrapper unreliably") + } + }; UnreliableWrapper { inner, attempts_to_fail, @@ -47,7 +61,7 @@ impl UnreliableWrapper { /// On the first attempts of this operation, return an error. After 'attempts_to_fail' /// attempts, let the operation go ahead, and clear the counter. /// - fn attempt(&self, op: RemoteOp) -> Result { + fn attempt(&self, op: RemoteOp) -> anyhow::Result { let mut attempts = self.attempts.lock().unwrap(); match attempts.entry(op) { @@ -65,47 +79,44 @@ impl UnreliableWrapper { } else { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); - Err(DownloadError::Other(error)) + Err(error) } } Entry::Vacant(e) => { let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key()); e.insert(1); - Err(DownloadError::Other(error)) + Err(error) } } } - async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> { + async fn delete_inner( + &self, + path: &RemotePath, + attempt: bool, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { if attempt { self.attempt(RemoteOp::Delete(path.clone()))?; } - self.inner.delete(path).await + self.inner.delete(path, cancel).await } } -#[async_trait::async_trait] +// We never construct this, so the type is not important, just has to not be 
UnreliableWrapper and impl RemoteStorage. +type VoidStorage = crate::LocalFs; + impl RemoteStorage for UnreliableWrapper { - async fn list_prefixes( - &self, - prefix: Option<&RemotePath>, - ) -> Result, DownloadError> { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; - self.inner.list_prefixes(prefix).await - } - - async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { - self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?; - self.inner.list_files(folder).await - } - async fn list( &self, prefix: Option<&RemotePath>, mode: ListingMode, + max_keys: Option, + cancel: &CancellationToken, ) -> Result { - self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?; - self.inner.list(prefix, mode).await + self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) + .map_err(DownloadError::Other)?; + self.inner.list(prefix, mode, max_keys, cancel).await } async fn upload( @@ -116,14 +127,22 @@ impl RemoteStorage for UnreliableWrapper { data_size_bytes: usize, to: &RemotePath, metadata: Option, + cancel: &CancellationToken, ) -> anyhow::Result<()> { self.attempt(RemoteOp::Upload(to.clone()))?; - self.inner.upload(data, data_size_bytes, to, metadata).await + self.inner + .upload(data, data_size_bytes, to, metadata, cancel) + .await } - async fn download(&self, from: &RemotePath) -> Result { - self.attempt(RemoteOp::Download(from.clone()))?; - self.inner.download(from).await + async fn download( + &self, + from: &RemotePath, + cancel: &CancellationToken, + ) -> Result { + self.attempt(RemoteOp::Download(from.clone())) + .map_err(DownloadError::Other)?; + self.inner.download(from, cancel).await } async fn download_byte_range( @@ -131,26 +150,32 @@ impl RemoteStorage for UnreliableWrapper { from: &RemotePath, start_inclusive: u64, end_exclusive: Option, + cancel: &CancellationToken, ) -> Result { // Note: We treat any download_byte_range as an "attempt" of the same // operation. We don't pay attention to the ranges. That's good enough // for now. 
- self.attempt(RemoteOp::Download(from.clone()))?; + self.attempt(RemoteOp::Download(from.clone())) + .map_err(DownloadError::Other)?; self.inner - .download_byte_range(from, start_inclusive, end_exclusive) + .download_byte_range(from, start_inclusive, end_exclusive, cancel) .await } - async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - self.delete_inner(path, true).await + async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { + self.delete_inner(path, true, cancel).await } - async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + async fn delete_objects<'a>( + &self, + paths: &'a [RemotePath], + cancel: &CancellationToken, + ) -> anyhow::Result<()> { self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?; let mut error_counter = 0; for path in paths { // Dont record attempt because it was already recorded above - if (self.delete_inner(path, false).await).is_err() { + if (self.delete_inner(path, false, cancel).await).is_err() { error_counter += 1; } } @@ -163,10 +188,29 @@ impl RemoteStorage for UnreliableWrapper { Ok(()) } - async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + async fn copy( + &self, + from: &RemotePath, + to: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { // copy is equivalent to download + upload self.attempt(RemoteOp::Download(from.clone()))?; self.attempt(RemoteOp::Upload(to.clone()))?; - self.inner.copy_object(from, to).await + self.inner.copy_object(from, to, cancel).await + } + + async fn time_travel_recover( + &self, + prefix: Option<&RemotePath>, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: &CancellationToken, + ) -> Result<(), TimeTravelError> { + self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned()))) + .map_err(TimeTravelError::Other)?; + self.inner + .time_travel_recover(prefix, timestamp, done_if_after, cancel) + .await } } diff --git 
a/libs/remote_storage/src/support.rs b/libs/remote_storage/src/support.rs new file mode 100644 index 0000000000..1ed9ed9305 --- /dev/null +++ b/libs/remote_storage/src/support.rs @@ -0,0 +1,215 @@ +use std::{ + future::Future, + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; + +use bytes::Bytes; +use futures_util::Stream; +use tokio_util::sync::CancellationToken; + +use crate::TimeoutOrCancel; + +pin_project_lite::pin_project! { + /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. + pub(crate) struct PermitCarrying { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl PermitCarrying { + pub(crate) fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + Self { permit, inner } + } +} + +impl Stream for PermitCarrying { + type Item = ::Item; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +pin_project_lite::pin_project! { + pub(crate) struct DownloadStream { + hit: bool, + #[pin] + cancellation: F, + #[pin] + inner: S, + } +} + +impl DownloadStream { + pub(crate) fn new(cancellation: F, inner: S) -> Self { + Self { + cancellation, + hit: false, + inner, + } + } +} + +/// See documentation on [`crate::DownloadStream`] on rationale why `std::io::Error` is used. 
+impl Stream for DownloadStream +where + std::io::Error: From, + F: Future, + S: Stream>, +{ + type Item = ::Item; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + + if !*this.hit { + if let Poll::Ready(e) = this.cancellation.poll(cx) { + *this.hit = true; + + // most likely this will be a std::io::Error wrapping a DownloadError + let e = Err(std::io::Error::from(e)); + return Poll::Ready(Some(e)); + } + } else { + // this would be perfectly valid behaviour for doing a graceful completion on the + // download for example, but not one we expect to do right now. + tracing::warn!("continuing polling after having cancelled or timeouted"); + } + + this.inner.poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +/// Fires only on the first cancel or timeout, not on both. +pub(crate) fn cancel_or_timeout( + timeout: Duration, + cancel: CancellationToken, +) -> impl std::future::Future + 'static { + // futures are lazy, they don't do anything before being polled. + // + // "precalculate" the wanted deadline before returning the future, so that we can use pause + // failpoint to trigger a timeout in test. + let deadline = tokio::time::Instant::now() + timeout; + async move { + tokio::select! { + _ = tokio::time::sleep_until(deadline) => TimeoutOrCancel::Timeout, + _ = cancel.cancelled() => { + TimeoutOrCancel::Cancel + }, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::DownloadError; + use futures::stream::StreamExt; + + #[tokio::test(start_paused = true)] + async fn cancelled_download_stream() { + let inner = futures::stream::pending(); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + let mut first = stream.next(); + + tokio::select! 
{ + _ = &mut first => unreachable!("we haven't yet cancelled nor is timeout passed"), + _ = tokio::time::sleep(Duration::from_secs(1)) => {}, + } + + cancel.cancel(); + + let e = first.await.expect("there must be some").unwrap_err(); + assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}"); + let inner = e.get_ref().expect("inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Cancelled)), + "{inner:?}" + ); + let e = DownloadError::from(e); + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); + + tokio::select! { + _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"), + _ = tokio::time::sleep(Duration::from_secs(121)) => {}, + } + } + + #[tokio::test(start_paused = true)] + async fn timeouted_download_stream() { + let inner = futures::stream::pending(); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + // because the stream uses 120s timeout and we are paused, we advance to 120s right away. + let first = stream.next(); + + let e = first.await.expect("there must be some").unwrap_err(); + assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}"); + let inner = e.get_ref().expect("inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Timeout)), + "{inner:?}" + ); + let e = DownloadError::from(e); + assert!(matches!(e, DownloadError::Timeout), "{e:?}"); + + cancel.cancel(); + + tokio::select! 
{ + _ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"), + _ = tokio::time::sleep(Duration::from_secs(121)) => {}, + } + } + + #[tokio::test] + async fn notified_but_pollable_after() { + let inner = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from_static( + b"hello world", + )))); + let timeout = Duration::from_secs(120); + let cancel = CancellationToken::new(); + + cancel.cancel(); + let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner); + let mut stream = std::pin::pin!(stream); + + let next = stream.next().await; + let ioe = next.unwrap().unwrap_err(); + assert!( + matches!( + ioe.get_ref().unwrap().downcast_ref::(), + Some(&DownloadError::Cancelled) + ), + "{ioe:?}" + ); + + let next = stream.next().await; + let bytes = next.unwrap().unwrap(); + assert_eq!(&b"hello world"[..], bytes); + } +} diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index bca117ed1a..da9dc08d8d 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -10,6 +10,7 @@ use futures::stream::Stream; use once_cell::sync::OnceCell; use remote_storage::{Download, GenericRemoteStorage, RemotePath}; use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; static LOGGING_DONE: OnceCell<()> = OnceCell::new(); @@ -58,8 +59,12 @@ pub(crate) async fn upload_simple_remote_data( ) -> ControlFlow, HashSet> { info!("Creating {upload_tasks_count} remote files"); let mut upload_tasks = JoinSet::new(); + let cancel = CancellationToken::new(); + for i in 1..upload_tasks_count + 1 { let task_client = Arc::clone(client); + let cancel = cancel.clone(); + upload_tasks.spawn(async move { let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); let blob_path = RemotePath::new( @@ -69,7 +74,9 @@ pub(crate) async fn upload_simple_remote_data( debug!("Creating remote item {i} at 
path {blob_path:?}"); let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, len, &blob_path, None).await?; + task_client + .upload(data, len, &blob_path, None, &cancel) + .await?; Ok::<_, anyhow::Error>(blob_path) }); @@ -107,13 +114,15 @@ pub(crate) async fn cleanup( "Removing {} objects from the remote storage during cleanup", objects_to_delete.len() ); + let cancel = CancellationToken::new(); let mut delete_tasks = JoinSet::new(); for object_to_delete in objects_to_delete { let task_client = Arc::clone(client); + let cancel = cancel.clone(); delete_tasks.spawn(async move { debug!("Deleting remote item at path {object_to_delete:?}"); task_client - .delete(&object_to_delete) + .delete(&object_to_delete, &cancel) .await .with_context(|| format!("{object_to_delete:?} removal")) }); @@ -141,8 +150,12 @@ pub(crate) async fn upload_remote_data( ) -> ControlFlow { info!("Creating {upload_tasks_count} remote files"); let mut upload_tasks = JoinSet::new(); + let cancel = CancellationToken::new(); + for i in 1..upload_tasks_count + 1 { let task_client = Arc::clone(client); + let cancel = cancel.clone(); + upload_tasks.spawn(async move { let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) @@ -152,7 +165,9 @@ pub(crate) async fn upload_remote_data( let (data, data_len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); - task_client.upload(data, data_len, &blob_path, None).await?; + task_client + .upload(data, data_len, &blob_path, None, &cancel) + .await?; Ok::<_, anyhow::Error>((blob_prefix, blob_path)) }); diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs new file mode 100644 index 0000000000..673151c8ef --- /dev/null +++ b/libs/remote_storage/tests/common/tests.rs @@ -0,0 +1,340 @@ +use anyhow::Context; +use camino::Utf8Path; +use remote_storage::ListingMode; +use 
remote_storage::RemotePath; +use std::sync::Arc; +use std::{collections::HashSet, num::NonZeroU32}; +use test_context::test_context; +use tokio_util::sync::CancellationToken; +use tracing::debug; + +use crate::common::{download_to_vec, upload_stream, wrap_stream}; + +use super::{ + MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs, +}; + +/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. +/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. +/// See the client creation in [`create_s3_client`] for details on the required env vars. +/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the +/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. +/// +/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] +/// where +/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference +/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket +/// +/// Then, verifies that the client does return correct prefixes when queried: +/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only +/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` +/// +/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. 
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// since current default AWS S3 pagination limit is 1000. +/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// +/// Lastly, the test attempts to clean up and remove all uploaded S3 files. +/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. +#[test_context(MaybeEnabledStorageWithTestBlobs)] +#[tokio::test] +async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + + let cancel = CancellationToken::new(); + + let test_client = Arc::clone(&ctx.enabled.client); + let expected_remote_prefixes = ctx.remote_prefixes.clone(); + + let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) + .context("common_prefix construction")?; + let root_remote_prefixes = test_client + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? + .prefixes + .into_iter() + .collect::>(); + assert_eq!( + root_remote_prefixes, HashSet::from([base_prefix.clone()]), + "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" + ); + + let nested_remote_prefixes = test_client + .list( + Some(&base_prefix.add_trailing_slash()), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? 
+ .prefixes + .into_iter() + .collect::>(); + let remote_only_prefixes = nested_remote_prefixes + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + + Ok(()) +} + +/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries. +/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set. +/// See `s3_pagination_should_work` for more information. +/// +/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] +/// Then performs the following queries: +/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` +/// 2. `list("folder1")`. 
This should return all files `random_prefix/folder1/blob_{i}.txt` +#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] +#[tokio::test] +async fn list_no_delimiter_works( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); + let base_prefix = + RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; + let root_files = test_client + .list(None, ListingMode::NoDelimiter, None, &cancel) + .await + .context("client list root files failure")? + .keys + .into_iter() + .collect::>(); + assert_eq!( + root_files, + ctx.remote_blobs.clone(), + "remote storage list on root mismatches with the uploads." + ); + + // Test that max_keys limit works. In total there are about 21 files (see + // upload_simple_remote_data call in test_real_s3.rs). + let limited_root_files = test_client + .list( + None, + ListingMode::NoDelimiter, + Some(NonZeroU32::new(2).unwrap()), + &cancel, + ) + .await + .context("client list root files failure")?; + assert_eq!(limited_root_files.keys.len(), 2); + + let nested_remote_files = test_client + .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel) + .await + .context("client list nested files failure")? + .keys + .into_iter() + .collect::>(); + let trim_remote_blobs: HashSet<_> = ctx + .remote_blobs + .iter() + .map(|x| x.get_path()) + .filter(|x| x.starts_with("folder1")) + .map(|x| RemotePath::new(x).expect("must be valid path")) + .collect(); + assert_eq!( + nested_remote_files, trim_remote_blobs, + "remote storage list on subdirrectory mismatches with the uploads." 
+ ); + Ok(()) +} + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorage::Enabled(ctx) => ctx, + MaybeEnabledStorage::Disabled => return Ok(()), + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(), + )) + .with_context(|| "RemotePath conversion")?; + + ctx.client + .delete(&path, &cancel) + .await + .expect("should succeed"); + + Ok(()) +} + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorage::Enabled(ctx) => ctx, + MaybeEnabledStorage::Disabled => return Ok(()), + }; + + let cancel = CancellationToken::new(); + + let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let (data, len) = upload_stream("remote blob data1".as_bytes().into()); + ctx.client.upload(data, len, &path1, None, &cancel).await?; + + let (data, len) = upload_stream("remote blob data2".as_bytes().into()); + ctx.client.upload(data, len, &path2, None, &cancel).await?; + + let (data, len) = upload_stream("remote blob data3".as_bytes().into()); + ctx.client.upload(data, len, &path3, None, &cancel).await?; + + ctx.client.delete_objects(&[path1, path2], &cancel).await?; + + let prefixes = ctx + .client + .list(None, ListingMode::WithDelimiter, None, &cancel) + .await? 
+ .prefixes; + + assert_eq!(prefixes.len(), 1); + + ctx.client.delete_objects(&[path3], &cancel).await?; + + Ok(()) +} + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return Ok(()); + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let orig = bytes::Bytes::from_static("remote blob data here".as_bytes()); + + let (data, len) = wrap_stream(orig.clone()); + + ctx.client.upload(data, len, &path, None, &cancel).await?; + + // Normal download request + let dl = ctx.client.download(&path, &cancel).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // Full range (end specified) + let dl = ctx + .client + .download_byte_range(&path, 0, Some(len as u64), &cancel) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + // partial range (end specified) + let dl = ctx + .client + .download_byte_range(&path, 4, Some(10), &cancel) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..10]); + + // partial range (end beyond real end) + let dl = ctx + .client + .download_byte_range(&path, 8, Some(len as u64 * 100), &cancel) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[8..]); + + // Partial range (end unspecified) + let dl = ctx + .client + .download_byte_range(&path, 4, None, &cancel) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig[4..]); + + // Full range (end unspecified) + let dl = ctx + .client + .download_byte_range(&path, 0, None, &cancel) + .await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + debug!("Cleanup: deleting file at path {path:?}"); + ctx.client + .delete(&path, &cancel) + .await + .with_context(|| format!("{path:?} 
removal"))?; + + Ok(()) +} + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return Ok(()); + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/file_to_copy", ctx.base_prefix).as_str(), + )) + .with_context(|| "RemotePath conversion")?; + let path_dest = RemotePath::new(Utf8Path::new( + format!("{}/file_dest", ctx.base_prefix).as_str(), + )) + .with_context(|| "RemotePath conversion")?; + + let orig = bytes::Bytes::from_static("remote blob data content".as_bytes()); + + let (data, len) = wrap_stream(orig.clone()); + + ctx.client.upload(data, len, &path, None, &cancel).await?; + + // Normal download request + ctx.client.copy_object(&path, &path_dest, &cancel).await?; + + let dl = ctx.client.download(&path_dest, &cancel).await?; + let buf = download_to_vec(dl).await?; + assert_eq!(&buf, &orig); + + debug!("Cleanup: deleting file at path {path:?}"); + ctx.client + .delete_objects(&[path.clone(), path_dest.clone()], &cancel) + .await + .with_context(|| format!("{path:?} removal"))?; + + Ok(()) +} diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 0387dc30e7..23628dfebe 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -1,268 +1,28 @@ -use std::collections::HashSet; use std::env; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; use std::time::UNIX_EPOCH; +use std::{collections::HashSet, time::Duration}; use anyhow::Context; -use camino::Utf8Path; use remote_storage::{ AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, }; -use test_context::{test_context, AsyncTestContext}; -use tracing::{debug, info}; +use test_context::AsyncTestContext; +use tracing::info; mod common; -use common::{ - cleanup, 
download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, - upload_stream, wrap_stream, -}; +#[path = "common/tests.rs"] +mod tests_azure; + +use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data}; const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE"; const BASE_PREFIX: &str = "test"; -/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries. -/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. -/// See the client creation in [`create_azure_client`] for details on the required env vars. -/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the -/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. -/// -/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] -/// where -/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference -/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket -/// -/// Then, verifies that the client does return correct prefixes when queried: -/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only -/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` -/// -/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys. 
-/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure. -/// -/// Lastly, the test attempts to clean up and remove all uploaded Azure files. -/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. -#[test_context(MaybeEnabledAzureWithTestBlobs)] -#[tokio::test] -async fn azure_pagination_should_work( - ctx: &mut MaybeEnabledAzureWithTestBlobs, -) -> anyhow::Result<()> { - let ctx = match ctx { - MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx, - MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()), - MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => { - anyhow::bail!("Azure init failed: {e:?}") - } - }; - - let test_client = Arc::clone(&ctx.enabled.client); - let expected_remote_prefixes = ctx.remote_prefixes.clone(); - - let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) - .context("common_prefix construction")?; - let root_remote_prefixes = test_client - .list_prefixes(None) - .await - .context("client list root prefixes failure")? - .into_iter() - .collect::>(); - assert_eq!( - root_remote_prefixes, HashSet::from([base_prefix.clone()]), - "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" - ); - - let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix)) - .await - .context("client list nested prefixes failure")? - .into_iter() - .collect::>(); - let remote_only_prefixes = nested_remote_prefixes - .difference(&expected_remote_prefixes) - .collect::>(); - let missing_uploaded_prefixes = expected_remote_prefixes - .difference(&nested_remote_prefixes) - .collect::>(); - assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, - "remote storage nested prefixes list mismatches with the uploads. 
Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", - ); - - Ok(()) -} - -/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requirees multiple Azure queries. -/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set. -/// See `Azure_pagination_should_work` for more information. -/// -/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] -/// Then performs the following queries: -/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` -/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` -#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)] -#[tokio::test] -async fn azure_list_files_works( - ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs, -) -> anyhow::Result<()> { - let ctx = match ctx { - MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx, - MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()), - MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => { - anyhow::bail!("Azure init failed: {e:?}") - } - }; - let test_client = Arc::clone(&ctx.enabled.client); - let base_prefix = - RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; - let root_files = test_client - .list_files(None) - .await - .context("client list root files failure")? - .into_iter() - .collect::>(); - assert_eq!( - root_files, - ctx.remote_blobs.clone(), - "remote storage list_files on root mismatches with the uploads." - ); - let nested_remote_files = test_client - .list_files(Some(&base_prefix)) - .await - .context("client list nested files failure")? 
- .into_iter() - .collect::>(); - let trim_remote_blobs: HashSet<_> = ctx - .remote_blobs - .iter() - .map(|x| x.get_path()) - .filter(|x| x.starts_with("folder1")) - .map(|x| RemotePath::new(x).expect("must be valid path")) - .collect(); - assert_eq!( - nested_remote_files, trim_remote_blobs, - "remote storage list_files on subdirrectory mismatches with the uploads." - ); - Ok(()) -} - -#[test_context(MaybeEnabledAzure)] -#[tokio::test] -async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> { - let ctx = match ctx { - MaybeEnabledAzure::Enabled(ctx) => ctx, - MaybeEnabledAzure::Disabled => return Ok(()), - }; - - let path = RemotePath::new(Utf8Path::new( - format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(), - )) - .with_context(|| "RemotePath conversion")?; - - ctx.client.delete(&path).await.expect("should succeed"); - - Ok(()) -} - -#[test_context(MaybeEnabledAzure)] -#[tokio::test] -async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> { - let ctx = match ctx { - MaybeEnabledAzure::Enabled(ctx) => ctx, - MaybeEnabledAzure::Disabled => return Ok(()), - }; - - let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) - .with_context(|| "RemotePath conversion")?; - - let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str())) - .with_context(|| "RemotePath conversion")?; - - let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) - .with_context(|| "RemotePath conversion")?; - - let (data, len) = upload_stream("remote blob data1".as_bytes().into()); - ctx.client.upload(data, len, &path1, None).await?; - - let (data, len) = upload_stream("remote blob data2".as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; - - let (data, len) = upload_stream("remote blob data3".as_bytes().into()); - ctx.client.upload(data, len, &path3, None).await?; - - 
ctx.client.delete_objects(&[path1, path2]).await?; - - let prefixes = ctx.client.list_prefixes(None).await?; - - assert_eq!(prefixes.len(), 1); - - ctx.client.delete_objects(&[path3]).await?; - - Ok(()) -} - -#[test_context(MaybeEnabledAzure)] -#[tokio::test] -async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> { - let MaybeEnabledAzure::Enabled(ctx) = ctx else { - return Ok(()); - }; - - let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) - .with_context(|| "RemotePath conversion")?; - - let orig = bytes::Bytes::from_static("remote blob data here".as_bytes()); - - let (data, len) = wrap_stream(orig.clone()); - - ctx.client.upload(data, len, &path, None).await?; - - // Normal download request - let dl = ctx.client.download(&path).await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig); - - // Full range (end specified) - let dl = ctx - .client - .download_byte_range(&path, 0, Some(len as u64)) - .await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig); - - // partial range (end specified) - let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig[4..10]); - - // partial range (end beyond real end) - let dl = ctx - .client - .download_byte_range(&path, 8, Some(len as u64 * 100)) - .await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig[8..]); - - // Partial range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 4, None).await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig[4..]); - - // Full range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 0, None).await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig); - - debug!("Cleanup: deleting file at path {path:?}"); - ctx.client - .delete(&path) - .await - .with_context(|| format!("{path:?} removal"))?; - - Ok(()) -} - struct EnabledAzure { client: 
Arc, base_prefix: &'static str, @@ -279,15 +39,25 @@ impl EnabledAzure { base_prefix: BASE_PREFIX, } } + + #[allow(unused)] // this will be needed when moving the timeout integration tests back + fn configure_request_timeout(&mut self, timeout: Duration) { + match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { + GenericRemoteStorage::AzureBlob(azure) => { + let azure = Arc::get_mut(azure).expect("inner Arc::get_mut"); + azure.timeout = timeout; + } + _ => unreachable!(), + } + } } -enum MaybeEnabledAzure { +enum MaybeEnabledStorage { Enabled(EnabledAzure), Disabled, } -#[async_trait::async_trait] -impl AsyncTestContext for MaybeEnabledAzure { +impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -303,7 +73,7 @@ impl AsyncTestContext for MaybeEnabledAzure { } } -enum MaybeEnabledAzureWithTestBlobs { +enum MaybeEnabledStorageWithTestBlobs { Enabled(AzureWithTestBlobs), Disabled, UploadsFailed(anyhow::Error, AzureWithTestBlobs), @@ -315,8 +85,7 @@ struct AzureWithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] -impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs { +impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { @@ -363,11 +132,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. 
-// See `RemoteStorage::list_files` documentation for more details -enum MaybeEnabledAzureWithSimpleTestBlobs { +enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(AzureWithSimpleTestBlobs), Disabled, UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs), @@ -377,8 +142,7 @@ struct AzureWithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] -impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs { +impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { @@ -448,11 +212,13 @@ fn create_azure_client( let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::AzureContainer(AzureConfig { container_name: remote_storage_azure_container, + storage_account: None, container_region: remote_storage_azure_region, prefix_in_container: Some(format!("test_{millis}_{random:08x}/")), concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, }), + timeout: Duration::from_secs(120), }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 8f46b2abd6..a273abe867 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -1,173 +1,91 @@ -use std::collections::HashSet; use std::env; +use std::fmt::{Debug, Display}; +use std::future::Future; use std::num::NonZeroUsize; use std::ops::ControlFlow; use std::sync::Arc; -use std::time::UNIX_EPOCH; +use std::time::{Duration, UNIX_EPOCH}; +use std::{collections::HashSet, time::SystemTime}; +use crate::common::{download_to_vec, upload_stream}; use anyhow::Context; use camino::Utf8Path; +use futures_util::StreamExt; use remote_storage::{ - GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, + DownloadError, 
GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, + RemoteStorageKind, S3Config, }; -use test_context::{test_context, AsyncTestContext}; -use tracing::{debug, info}; +use test_context::test_context; +use test_context::AsyncTestContext; +use tokio::io::AsyncBufReadExt; +use tokio_util::sync::CancellationToken; +use tracing::info; mod common; -use common::{ - cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data, - upload_stream, wrap_stream, -}; +#[path = "common/tests.rs"] +mod tests_s3; + +use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data}; +use utils::backoff; const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; - const BASE_PREFIX: &str = "test"; -/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. -/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. -/// See the client creation in [`create_s3_client`] for details on the required env vars. -/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the -/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. 
-/// -/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`] -/// where -/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference -/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket -/// -/// Then, verifies that the client does return correct prefixes when queried: -/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only -/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` -/// -/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. -/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, -/// since current default AWS S3 pagination limit is 1000. -/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) -/// -/// Lastly, the test attempts to clean up and remove all uploaded S3 files. -/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. 
-#[test_context(MaybeEnabledS3WithTestBlobs)] +#[test_context(MaybeEnabledStorage)] #[tokio::test] -async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> { +async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { let ctx = match ctx { - MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx, - MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()), - MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"), + MaybeEnabledStorage::Enabled(ctx) => ctx, + MaybeEnabledStorage::Disabled => return Ok(()), }; + // Our test depends on discrepancies in the clock between S3 and the environment the tests + // run in. Therefore, wait a little bit before and after. The alternative would be + // to take the time from S3 response headers. + const WAIT_TIME: Duration = Duration::from_millis(3_000); - let test_client = Arc::clone(&ctx.enabled.client); - let expected_remote_prefixes = ctx.remote_prefixes.clone(); - - let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) - .context("common_prefix construction")?; - let root_remote_prefixes = test_client - .list_prefixes(None) + async fn retry(op: O) -> Result + where + E: Display + Debug + 'static, + O: FnMut() -> F, + F: Future>, + { + let warn_threshold = 3; + let max_retries = 10; + backoff::retry( + op, + |_e| false, + warn_threshold, + max_retries, + "test retry", + &CancellationToken::new(), + ) .await - .context("client list root prefixes failure")? - .into_iter() - .collect::>(); - assert_eq!( - root_remote_prefixes, HashSet::from([base_prefix.clone()]), - "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" - ); + .expect("never cancelled") + } - let nested_remote_prefixes = test_client - .list_prefixes(Some(&base_prefix)) - .await - .context("client list nested prefixes failure")? 
- .into_iter() - .collect::>(); - let remote_only_prefixes = nested_remote_prefixes - .difference(&expected_remote_prefixes) - .collect::>(); - let missing_uploaded_prefixes = expected_remote_prefixes - .difference(&nested_remote_prefixes) - .collect::>(); - assert_eq!( - remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, - "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", - ); + async fn time_point() -> SystemTime { + tokio::time::sleep(WAIT_TIME).await; + let ret = SystemTime::now(); + tokio::time::sleep(WAIT_TIME).await; + ret + } - Ok(()) -} + async fn list_files( + client: &Arc, + cancel: &CancellationToken, + ) -> anyhow::Result> { + Ok( + retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel)) + .await + .context("list root files failure")? + .keys + .into_iter() + .collect::>(), + ) + } -/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries. -/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set. -/// See `s3_pagination_should_work` for more information. -/// -/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`] -/// Then performs the following queries: -/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` -/// 2. `list_files("folder1")`. 
This should return all files `random_prefix/folder1/blob_{i}.txt` -#[test_context(MaybeEnabledS3WithSimpleTestBlobs)] -#[tokio::test] -async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> { - let ctx = match ctx { - MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx, - MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()), - MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => { - anyhow::bail!("S3 init failed: {e:?}") - } - }; - let test_client = Arc::clone(&ctx.enabled.client); - let base_prefix = - RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; - let root_files = test_client - .list_files(None) - .await - .context("client list root files failure")? - .into_iter() - .collect::>(); - assert_eq!( - root_files, - ctx.remote_blobs.clone(), - "remote storage list_files on root mismatches with the uploads." - ); - let nested_remote_files = test_client - .list_files(Some(&base_prefix)) - .await - .context("client list nested files failure")? - .into_iter() - .collect::>(); - let trim_remote_blobs: HashSet<_> = ctx - .remote_blobs - .iter() - .map(|x| x.get_path()) - .filter(|x| x.starts_with("folder1")) - .map(|x| RemotePath::new(x).expect("must be valid path")) - .collect(); - assert_eq!( - nested_remote_files, trim_remote_blobs, - "remote storage list_files on subdirrectory mismatches with the uploads." 
- ); - Ok(()) -} - -#[test_context(MaybeEnabledS3)] -#[tokio::test] -async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { - let ctx = match ctx { - MaybeEnabledS3::Enabled(ctx) => ctx, - MaybeEnabledS3::Disabled => return Ok(()), - }; - - let path = RemotePath::new(Utf8Path::new( - format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(), - )) - .with_context(|| "RemotePath conversion")?; - - ctx.client.delete(&path).await.expect("should succeed"); - - Ok(()) -} - -#[test_context(MaybeEnabledS3)] -#[tokio::test] -async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { - let ctx = match ctx { - MaybeEnabledS3::Enabled(ctx) => ctx, - MaybeEnabledS3::Disabled => return Ok(()), - }; + let cancel = CancellationToken::new(); let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; @@ -178,83 +96,95 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) .with_context(|| "RemotePath conversion")?; - let (data, len) = upload_stream("remote blob data1".as_bytes().into()); - ctx.client.upload(data, len, &path1, None).await?; + retry(|| { + let (data, len) = upload_stream("remote blob data1".as_bytes().into()); + ctx.client.upload(data, len, &path1, None, &cancel) + }) + .await?; - let (data, len) = upload_stream("remote blob data2".as_bytes().into()); - ctx.client.upload(data, len, &path2, None).await?; + let t0_files = list_files(&ctx.client, &cancel).await?; + let t0 = time_point().await; + println!("at t0: {t0_files:?}"); - let (data, len) = upload_stream("remote blob data3".as_bytes().into()); - ctx.client.upload(data, len, &path3, None).await?; + let old_data = "remote blob data2"; - ctx.client.delete_objects(&[path1, path2]).await?; + retry(|| { + let (data, len) = 
upload_stream(old_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None, &cancel) + }) + .await?; - let prefixes = ctx.client.list_prefixes(None).await?; + let t1_files = list_files(&ctx.client, &cancel).await?; + let t1 = time_point().await; + println!("at t1: {t1_files:?}"); - assert_eq!(prefixes.len(), 1); + // A little check to ensure that our clock is not too far off from the S3 clock + { + let dl = retry(|| ctx.client.download(&path2, &cancel)).await?; + let last_modified = dl.last_modified; + let half_wt = WAIT_TIME.mul_f32(0.5); + let t0_hwt = t0 + half_wt; + let t1_hwt = t1 - half_wt; + if !(t0_hwt..=t1_hwt).contains(&last_modified) { + panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \ + This likely means a large lock discrepancy between S3 and the local clock."); + } + } - ctx.client.delete_objects(&[path3]).await?; + retry(|| { + let (data, len) = upload_stream("remote blob data3".as_bytes().into()); + ctx.client.upload(data, len, &path3, None, &cancel) + }) + .await?; - Ok(()) -} + let new_data = "new remote blob data2"; -#[test_context(MaybeEnabledS3)] -#[tokio::test] -async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { - let MaybeEnabledS3::Enabled(ctx) = ctx else { - return Ok(()); - }; + retry(|| { + let (data, len) = upload_stream(new_data.as_bytes().into()); + ctx.client.upload(data, len, &path2, None, &cancel) + }) + .await?; - let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) - .with_context(|| "RemotePath conversion")?; + retry(|| ctx.client.delete(&path1, &cancel)).await?; + let t2_files = list_files(&ctx.client, &cancel).await?; + let t2 = time_point().await; + println!("at t2: {t2_files:?}"); - let orig = bytes::Bytes::from_static("remote blob data here".as_bytes()); - - let (data, len) = wrap_stream(orig.clone()); - - ctx.client.upload(data, len, &path, None).await?; - - // Normal download request - let dl = 
ctx.client.download(&path).await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig); - - // Full range (end specified) - let dl = ctx - .client - .download_byte_range(&path, 0, Some(len as u64)) - .await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig); - - // partial range (end specified) - let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig[4..10]); - - // partial range (end beyond real end) - let dl = ctx - .client - .download_byte_range(&path, 8, Some(len as u64 * 100)) - .await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig[8..]); - - // Partial range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 4, None).await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig[4..]); - - // Full range (end unspecified) - let dl = ctx.client.download_byte_range(&path, 0, None).await?; - let buf = download_to_vec(dl).await?; - assert_eq!(&buf, &orig); - - debug!("Cleanup: deleting file at path {path:?}"); + // No changes after recovery to t2 (no-op) + let t_final = time_point().await; ctx.client - .delete(&path) - .await - .with_context(|| format!("{path:?} removal"))?; + .time_travel_recover(None, t2, t_final, &cancel) + .await?; + let t2_files_recovered = list_files(&ctx.client, &cancel).await?; + println!("after recovery to t2: {t2_files_recovered:?}"); + assert_eq!(t2_files, t2_files_recovered); + let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; + assert_eq!(path2_recovered_t2, new_data.as_bytes()); + + // after recovery to t1: path1 is back, path2 has the old content + let t_final = time_point().await; + ctx.client + .time_travel_recover(None, t1, t_final, &cancel) + .await?; + let t1_files_recovered = list_files(&ctx.client, &cancel).await?; + println!("after recovery to t1: {t1_files_recovered:?}"); + assert_eq!(t1_files, t1_files_recovered); + let 
path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; + assert_eq!(path2_recovered_t1, old_data.as_bytes()); + + // after recovery to t0: everything is gone except for path1 + let t_final = time_point().await; + ctx.client + .time_travel_recover(None, t0, t_final, &cancel) + .await?; + let t0_files_recovered = list_files(&ctx.client, &cancel).await?; + println!("after recovery to t0: {t0_files_recovered:?}"); + assert_eq!(t0_files, t0_files_recovered); + + // cleanup + + let paths = &[path1, path2, path3]; + retry(|| ctx.client.delete_objects(paths, &cancel)).await?; Ok(()) } @@ -275,15 +205,24 @@ impl EnabledS3 { base_prefix: BASE_PREFIX, } } + + fn configure_request_timeout(&mut self, timeout: Duration) { + match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") { + GenericRemoteStorage::AwsS3(s3) => { + let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut"); + s3.timeout = timeout; + } + _ => unreachable!(), + } + } } -enum MaybeEnabledS3 { +enum MaybeEnabledStorage { Enabled(EnabledS3), Disabled, } -#[async_trait::async_trait] -impl AsyncTestContext for MaybeEnabledS3 { +impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -299,7 +238,7 @@ impl AsyncTestContext for MaybeEnabledS3 { } } -enum MaybeEnabledS3WithTestBlobs { +enum MaybeEnabledStorageWithTestBlobs { Enabled(S3WithTestBlobs), Disabled, UploadsFailed(anyhow::Error, S3WithTestBlobs), @@ -311,8 +250,7 @@ struct S3WithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] -impl AsyncTestContext for MaybeEnabledS3WithTestBlobs { +impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { @@ -359,11 +297,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs { } } -// NOTE: the setups for the list_prefixes test and the list_files test are very similar -// However, they are not 
idential. The list_prefixes function is concerned with listing prefixes, -// whereas the list_files function is concerned with listing files. -// See `RemoteStorage::list_files` documentation for more details -enum MaybeEnabledS3WithSimpleTestBlobs { +enum MaybeEnabledStorageWithSimpleTestBlobs { Enabled(S3WithSimpleTestBlobs), Disabled, UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs), @@ -373,8 +307,7 @@ struct S3WithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] -impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs { +impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { @@ -447,9 +380,176 @@ fn create_s3_client( endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + upload_storage_class: None, }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, )) } + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return; + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/file_to_copy", ctx.base_prefix).as_str(), + )) + .unwrap(); + + let len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + + let timeout = std::time::Duration::from_secs(5); + + ctx.configure_request_timeout(timeout); + + let started_at = std::time::Instant::now(); + let mut stream = ctx + .client + .download(&path, &cancel) + .await + .expect("download succeeds") + .download_stream; + + if started_at.elapsed().mul_f32(0.9) >= timeout { + tracing::warn!( + elapsed_ms = started_at.elapsed().as_millis(), + "timeout might be too low, consumed most of it during headers" + ); + } + + let first = stream + 
.next() + .await + .expect("should have the first blob") + .expect("should have succeeded"); + + tracing::info!(len = first.len(), "downloaded first chunk"); + + assert!( + first.len() < len, + "uploaded file is too small, we downloaded all on first chunk" + ); + + tokio::time::sleep(timeout).await; + + { + let started_at = std::time::Instant::now(); + let next = stream + .next() + .await + .expect("stream should not have ended yet"); + + tracing::info!( + next.is_err = next.is_err(), + elapsed_ms = started_at.elapsed().as_millis(), + "received item after timeout" + ); + + let e = next.expect_err("expected an error, but got a chunk?"); + + let inner = e.get_ref().expect("std::io::Error::inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Timeout)), + "{inner:?}" + ); + } + + ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT); + + ctx.client.delete_objects(&[path], &cancel).await.unwrap() +} + +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return; + }; + + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new( + format!("{}/file_to_copy", ctx.base_prefix).as_str(), + )) + .unwrap(); + + let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await; + + { + let stream = ctx + .client + .download(&path, &cancel) + .await + .expect("download succeeds") + .download_stream; + + let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream)); + + let first = reader.fill_buf().await.expect("should have the first blob"); + + let len = first.len(); + tracing::info!(len, "downloaded first chunk"); + + assert!( + first.len() < file_len, + "uploaded file is too small, we downloaded all on first chunk" + ); + + reader.consume(len); + + cancel.cancel(); + + let next = reader.fill_buf().await; + + let e = next.expect_err("expected 
an error, but got a chunk?"); + + let inner = e.get_ref().expect("std::io::Error::inner should be set"); + assert!( + inner + .downcast_ref::() + .is_some_and(|e| matches!(e, DownloadError::Cancelled)), + "{inner:?}" + ); + + let e = DownloadError::from(e); + + assert!(matches!(e, DownloadError::Cancelled), "{e:?}"); + } + + let cancel = CancellationToken::new(); + + ctx.client.delete_objects(&[path], &cancel).await.unwrap(); +} + +/// Upload a long enough file so that we cannot download it in single chunk +/// +/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin +async fn upload_large_enough_file( + client: &GenericRemoteStorage, + path: &RemotePath, + cancel: &CancellationToken, +) -> usize { + let header = bytes::Bytes::from_static("remote blob data content".as_bytes()); + let body = bytes::Bytes::from(vec![0u8; 1024]); + let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128)); + + let len = contents.clone().fold(0, |acc, next| acc + next.len()); + + let contents = futures::stream::iter(contents.map(std::io::Result::Ok)); + + client + .upload(contents, len, path, None, cancel) + .await + .expect("upload succeeds"); + + len +} diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index ce5a1e411e..2fbc333075 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -50,6 +50,9 @@ pub struct SkTimelineInfo { pub safekeeper_connstr: Option, #[serde(default)] pub http_connstr: Option, + // Minimum of all active RO replicas flush LSN + #[serde(default = "lsn_invalid")] + pub standby_horizon: Lsn, } #[derive(Debug, Clone, Deserialize, Serialize)] diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs index 7660d41c56..0ffea0f2cd 100644 --- a/libs/tenant_size_model/tests/tests.rs +++ b/libs/tenant_size_model/tests/tests.rs @@ -247,7 +247,7 @@ fn scenario_4() { // // This is in total 5000 + 1000 + 5000 + 1000 = 
12000 // - // (If we used the the method from the previous scenario, and + // (If we used the method from the previous scenario, and // kept only snapshot at the branch point, we'd need to keep // all the WAL between 10000-18000 on the main branch, so // the total size would be 5000 + 1000 + 8000 = 14000. The diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index b285c9b5b0..512a748124 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -7,7 +7,7 @@ license.workspace = true [dependencies] hyper.workspace = true opentelemetry = { workspace = true, features=["rt-tokio"] } -opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions.workspace = true reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 706b7a3187..a6a081c5c1 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -13,6 +13,7 @@ testing = ["fail/failpoints"] [dependencies] arc-swap.workspace = true sentry.workspace = true +async-compression.workspace = true async-trait.workspace = true anyhow.workspace = true bincode.workspace = true @@ -21,10 +22,12 @@ camino.workspace = true chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } +humantime.workspace = true hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true +leaky-bucket.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true @@ -35,6 +38,7 @@ serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true 
tokio.workspace = true +tokio-tar.workspace = true tokio-util.workspace = true tracing.workspace = true tracing-error.workspace = true @@ -45,6 +49,7 @@ strum.workspace = true strum_macros.workspace = true url.workspace = true uuid.workspace = true +walkdir.workspace = true pq_proto.workspace = true postgres_connection.workspace = true diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 98d839ca55..44eb36387c 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -1,5 +1,3 @@ -#![allow(unused)] - use criterion::{criterion_group, criterion_main, Criterion}; use utils::id; diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 66b1f6e866..03e65f74fe 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -1,7 +1,6 @@ // For details about authentication see docs/authentication.md use arc_swap::ArcSwap; -use serde; use std::{borrow::Cow, fmt::Display, fs, sync::Arc}; use anyhow::Result; @@ -29,6 +28,11 @@ pub enum Scope { // Should only be used e.g. for status check. // Currently also used for connection from any pageserver to any safekeeper. SafekeeperData, + // The scope used by pageservers in upcalls to storage controller and cloud control plane + #[serde(rename = "generations_api")] + GenerationsApi, + // Allows access to control plane management API and some storage controller endpoints. + Admin, } /// JWT payload. See docs/authentication.md for the format @@ -127,6 +131,10 @@ impl JwtAuth { Ok(Self::new(decoding_keys)) } + pub fn from_key(key: String) -> Result { + Ok(Self::new(vec![DecodingKey::from_ed_pem(key.as_bytes())?])) + } + /// Attempt to decode the token with the internal decoding keys. 
/// /// The function tries the stored decoding keys in succession, @@ -197,12 +205,11 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH // "scope": "tenant", // "tenant_id": "3d1f7595b468230304e0b73cecbcb081", // "iss": "neon.controlplane", - // "exp": 1709200879, // "iat": 1678442479 // } // ``` // - let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw"; + let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJpYXQiOjE2Nzg0NDI0Nzl9.rNheBnluMJNgXzSTTJoTNIGy4P_qe0JUHl_nVEGuDCTgHOThPVr552EnmKccrCKquPeW3c2YUk0Y9Oh4KyASAw"; // Check it can be validated with the public key let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]); diff --git a/libs/utils/src/backoff.rs b/libs/utils/src/backoff.rs index d50ad39585..096c7e5854 100644 --- a/libs/utils/src/backoff.rs +++ b/libs/utils/src/backoff.rs @@ -37,69 +37,53 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec } } -/// Configure cancellation for a retried operation: when to cancel (the token), and -/// what kind of error to return on cancellation -pub struct Cancel -where - E: Display + Debug + 'static, - CF: Fn() -> E, -{ - token: CancellationToken, - on_cancel: CF, -} - -impl Cancel -where - E: Display + Debug + 'static, - CF: Fn() -> E, -{ - pub fn new(token: CancellationToken, on_cancel: CF) -> Self { - Self { token, on_cancel } - } -} - -/// retries passed operation until one of the following conditions are met: -/// Encountered error is considered as permanent (non-retryable) -/// Retries have been exhausted. 
-/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors -/// When attempts cross `warn_threshold` function starts to emit log warnings. +/// Retries passed operation until one of the following conditions are met: +/// - encountered error is considered as permanent (non-retryable) +/// - retries have been exhausted +/// - cancellation token has been cancelled +/// +/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent +/// errors. When attempts cross `warn_threshold` function starts to emit log warnings. /// `description` argument is added to log messages. Its value should identify the `op` is doing -/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken -/// to drop out promptly on shutdown. -pub async fn retry( +/// `cancel` cancels new attempts and the backoff sleep. +/// +/// If attempts fail, they are being logged with `{:#}` which works for anyhow, but does not work +/// for any other error type. Final failed attempt is logged with `{:?}`. +/// +/// Returns `None` if cancellation was noticed during backoff or the terminal result. +pub async fn retry( mut op: O, is_permanent: impl Fn(&E) -> bool, warn_threshold: u32, max_retries: u32, description: &str, - cancel: Cancel, -) -> Result + cancel: &CancellationToken, +) -> Option> where // Not std::error::Error because anyhow::Error doesnt implement it. 
// For context see https://github.com/dtolnay/anyhow/issues/63 E: Display + Debug + 'static, O: FnMut() -> F, F: Future>, - CF: Fn() -> E, { let mut attempts = 0; loop { - if cancel.token.is_cancelled() { - return Err((cancel.on_cancel)()); + if cancel.is_cancelled() { + return None; } let result = op().await; - match result { + match &result { Ok(_) => { if attempts > 0 { tracing::info!("{description} succeeded after {attempts} retries"); } - return result; + return Some(result); } // These are "permanent" errors that should not be retried. - Err(ref e) if is_permanent(e) => { - return result; + Err(e) if is_permanent(e) => { + return Some(result); } // Assume that any other failure might be transient, and the operation might // succeed if we just keep trying. @@ -109,12 +93,12 @@ where Err(err) if attempts < max_retries => { tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}"); } - Err(ref err) => { + Err(err) => { // Operation failed `max_attempts` times. Time to give up. 
tracing::warn!( "{description} still failed after {attempts} retries, giving up: {err:?}" ); - return result; + return Some(result); } } // sleep and retry @@ -122,7 +106,7 @@ where attempts, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &cancel.token, + cancel, ) .await; attempts += 1; @@ -131,11 +115,9 @@ where #[cfg(test)] mod tests { - use std::io; - - use tokio::sync::Mutex; - use super::*; + use std::io; + use tokio::sync::Mutex; #[test] fn backoff_defaults_produce_growing_backoff_sequence() { @@ -166,7 +148,7 @@ mod tests { #[tokio::test(start_paused = true)] async fn retry_always_error() { let count = Mutex::new(0); - let err_result = retry( + retry( || async { *count.lock().await += 1; Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other)) @@ -175,11 +157,11 @@ mod tests { 1, 1, "work", - Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }), + &CancellationToken::new(), ) - .await; - - assert!(err_result.is_err()); + .await + .expect("not cancelled") + .expect_err("it can only fail"); assert_eq!(*count.lock().await, 2); } @@ -201,10 +183,11 @@ mod tests { 2, 2, "work", - Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }), + &CancellationToken::new(), ) .await - .unwrap(); + .expect("not cancelled") + .expect("success on second try"); } #[tokio::test(start_paused = true)] @@ -224,10 +207,11 @@ mod tests { 2, 2, "work", - Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }), + &CancellationToken::new(), ) .await - .unwrap_err(); + .expect("was not cancellation") + .expect_err("it was permanent error"); assert_eq!(*count.lock().await, 1); } diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index ca6827c9b8..2fef8d35df 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -4,7 +4,9 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// /// Can be cloned, moved and kept around in futures as "guard 
objects". #[derive(Clone)] -pub struct Completion(TaskTrackerToken); +pub struct Completion { + _token: TaskTrackerToken, +} /// Barrier will wait until all clones of [`Completion`] have been dropped. #[derive(Clone)] @@ -27,6 +29,11 @@ impl Barrier { b.wait().await } } + + /// Return true if a call to wait() would complete immediately + pub fn is_ready(&self) -> bool { + futures::future::FutureExt::now_or_never(self.0.wait()).is_some() + } } impl PartialEq for Barrier { @@ -44,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion(token), Barrier(tracker)) + (Completion { _token: token }, Barrier(tracker)) } diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index b089af4a02..756b19138c 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,7 @@ use std::{ borrow::Cow, fs::{self, File}, - io, + io::{self, Write}, }; use camino::{Utf8Path, Utf8PathBuf}; @@ -112,6 +112,97 @@ pub async fn fsync_async(path: impl AsRef) -> Result<(), std::io::Erro tokio::fs::File::open(path.as_ref()).await?.sync_all().await } +pub async fn fsync_async_opt( + path: impl AsRef, + do_fsync: bool, +) -> Result<(), std::io::Error> { + if do_fsync { + fsync_async(path.as_ref()).await?; + } + Ok(()) +} + +/// Like postgres' durable_rename, renames file issuing fsyncs do make it +/// durable. After return, file and rename are guaranteed to be persisted. +/// +/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make +/// contents durable; 2) its directory entry to make rename durable 3) again to +/// already renamed file, which is not required by standards but postgres does +/// it, let's stick to that. Postgres additionally fsyncs newpath *before* +/// rename if it exists to ensure that at least one of the files survives, but +/// current callers don't need that. +/// +/// virtual_file.rs has similar code, but it doesn't use vfs. 
+/// +/// Useful links: +/// +/// +pub async fn durable_rename( + old_path: impl AsRef, + new_path: impl AsRef, + do_fsync: bool, +) -> io::Result<()> { + // first fsync the file + fsync_async_opt(old_path.as_ref(), do_fsync).await?; + + // Time to do the real deal. + tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?; + + // Postgres'ish fsync of renamed file. + fsync_async_opt(new_path.as_ref(), do_fsync).await?; + + // Now fsync the parent + let parent = match new_path.as_ref().parent() { + Some(p) => p, + None => Utf8Path::new("./"), // assume current dir if there is no parent + }; + fsync_async_opt(parent, do_fsync).await?; + + Ok(()) +} + +/// Writes a file to the specified `final_path` in a crash safe fasion, using [`std::fs`]. +/// +/// The file is first written to the specified `tmp_path`, and in a second +/// step, the `tmp_path` is renamed to the `final_path`. Intermediary fsync +/// and atomic rename guarantee that, if we crash at any point, there will never +/// be a partially written file at `final_path` (but maybe at `tmp_path`). +/// +/// Callers are responsible for serializing calls of this function for a given `final_path`. +/// If they don't, there may be an error due to conflicting `tmp_path`, or there will +/// be no error and the content of `final_path` will be the "winner" caller's `content`. +/// I.e., the atomticity guarantees still hold. +pub fn overwrite( + final_path: &Utf8Path, + tmp_path: &Utf8Path, + content: &[u8], +) -> std::io::Result<()> { + let Some(final_path_parent) = final_path.parent() else { + return Err(std::io::Error::from_raw_os_error( + nix::errno::Errno::EINVAL as i32, + )); + }; + std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?; + let mut file = std::fs::OpenOptions::new() + .write(true) + // Use `create_new` so that, if we race with ourselves or something else, + // we bail out instead of causing damage. 
+ .create_new(true) + .open(tmp_path)?; + file.write_all(content)?; + file.sync_all()?; + drop(file); // don't keep the fd open for longer than we have to + + std::fs::rename(tmp_path, final_path)?; + + let final_parent_dirfd = std::fs::OpenOptions::new() + .read(true) + .open(final_path_parent)?; + + final_parent_dirfd.sync_all()?; + Ok(()) +} + #[cfg(test)] mod tests { diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs new file mode 100644 index 0000000000..b3e326bfd0 --- /dev/null +++ b/libs/utils/src/env.rs @@ -0,0 +1,21 @@ +//! Wrapper around `std::env::var` for parsing environment variables. + +use std::{fmt::Display, str::FromStr}; + +pub fn var(varname: &str) -> Option +where + V: FromStr, + E: Display, +{ + match std::env::var(varname) { + Ok(s) => Some( + s.parse() + .map_err(|e| format!("failed to parse env var {varname}: {e:#}")) + .unwrap(), + ), + Err(std::env::VarError::NotPresent) => None, + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {varname} is not unicode") + } + } +} diff --git a/libs/utils/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs index 5ec532e2a6..870684b399 100644 --- a/libs/utils/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -9,12 +9,43 @@ use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::*; +/// Declare a failpoint that can use the `pause` failpoint action. +/// We don't want to block the executor thread, hence, spawn_blocking + await. +#[macro_export] +macro_rules! 
pausable_failpoint { + ($name:literal) => { + if cfg!(feature = "testing") { + tokio::task::spawn_blocking({ + let current = tracing::Span::current(); + move || { + let _entered = current.entered(); + tracing::info!("at failpoint {}", $name); + fail::fail_point!($name); + } + }) + .await + .expect("spawn_blocking"); + } + }; + ($name:literal, $cond:expr) => { + if cfg!(feature = "testing") { + if $cond { + pausable_failpoint!($name) + } + } + }; +} + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the /// specified time (in milliseconds). The main difference is that we use async /// tokio sleep function. Another difference is that we print lines to the log, /// which can be useful in tests to check that the failpoint was hit. +/// +/// Optionally pass a cancellation token, and this failpoint will drop out of +/// its sleep when the cancellation token fires. This is useful for testing +/// cases where we would like to block something, but test its clean shutdown behavior. #[macro_export] macro_rules! __failpoint_sleep_millis_async { ($name:literal) => {{ @@ -30,6 +61,24 @@ macro_rules! __failpoint_sleep_millis_async { $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await } }}; + ($name:literal, $cancel:expr) => {{ + // If the failpoint is used with a "return" action, set should_sleep to the + // returned value (as string). Otherwise it's set to None. 
+ let should_sleep = (|| { + ::fail::fail_point!($name, |x| x); + ::std::option::Option::None + })(); + + // Sleep if the action was a returned value + if let ::std::option::Option::Some(duration_str) = should_sleep { + $crate::failpoint_support::failpoint_sleep_cancellable_helper( + $name, + duration_str, + $cancel, + ) + .await + } + }}; } pub use __failpoint_sleep_millis_async as sleep_millis_async; @@ -45,6 +94,22 @@ pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { tracing::info!("failpoint {:?}: sleep done", name); } +// Helper function used by the macro. (A function has nicer scoping so we +// don't need to decorate everything with "::") +#[doc(hidden)] +pub async fn failpoint_sleep_cancellable_helper( + name: &'static str, + duration_str: String, + cancel: &CancellationToken, +) { + let millis = duration_str.parse::().unwrap(); + let d = std::time::Duration::from_millis(millis); + + tracing::info!("failpoint {:?}: sleeping for {:?}", name, d); + tokio::time::timeout(d, cancel.cancelled()).await.ok(); + tracing::info!("failpoint {:?}: sleep done", name); +} + pub fn init() -> fail::FailScenario<'static> { // The failpoints lib provides support for parsing the `FAILPOINTS` env var. // We want non-default behavior for `exit`, though, so, we handle it separately. diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 90ba348a02..8e53d2c79b 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -3,6 +3,9 @@ use std::{fs, io, path::Path}; use anyhow::Context; +mod rename_noreplace; +pub use rename_noreplace::rename_noreplace; + pub trait PathExt { /// Returns an error if `self` is not a directory. 
fn is_empty_dir(&self) -> io::Result; diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs new file mode 100644 index 0000000000..897e30d7f1 --- /dev/null +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -0,0 +1,109 @@ +use nix::NixPath; + +/// Rename a file without replacing an existing file. +/// +/// This is a wrapper around platform-specific APIs. +pub fn rename_noreplace( + src: &P1, + dst: &P2, +) -> nix::Result<()> { + { + #[cfg(target_os = "linux")] + { + nix::fcntl::renameat2( + None, + src, + None, + dst, + nix::fcntl::RenameFlags::RENAME_NOREPLACE, + ) + } + #[cfg(target_os = "macos")] + { + let res = src.with_nix_path(|src| { + dst.with_nix_path(|dst| + // SAFETY: `src` and `dst` are valid C strings as per the NixPath trait and they outlive the call to renamex_np. + unsafe { + nix::libc::renamex_np(src.as_ptr(), dst.as_ptr(), nix::libc::RENAME_EXCL) + }) + })??; + nix::errno::Errno::result(res).map(drop) + } + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + std::compile_error!("OS does not support no-replace renames"); + } + } +} + +#[cfg(test)] +mod test { + use std::{fs, path::PathBuf}; + + use super::*; + + fn testdir() -> camino_tempfile::Utf8TempDir { + match crate::env::var("NEON_UTILS_RENAME_NOREPLACE_TESTDIR") { + Some(path) => { + let path: camino::Utf8PathBuf = path; + camino_tempfile::tempdir_in(path).unwrap() + } + None => camino_tempfile::tempdir().unwrap(), + } + } + + #[test] + fn test_absolute_paths() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + let src = testdir.path().join("src"); + let dst = testdir.path().join("dst"); + + fs::write(&src, b"").unwrap(); + fs::write(&dst, b"").unwrap(); + + let src = src.canonicalize().unwrap(); + assert!(src.is_absolute()); + let dst = dst.canonicalize().unwrap(); + assert!(dst.is_absolute()); + + let result = rename_noreplace(&src, &dst); + assert_eq!(result.unwrap_err(), nix::Error::EEXIST); + } + + 
#[test] + fn test_relative_paths() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + // this is fine because we run in nextest => process per test + std::env::set_current_dir(testdir.path()).unwrap(); + + let src = PathBuf::from("src"); + let dst = PathBuf::from("dst"); + + fs::write(&src, b"").unwrap(); + fs::write(&dst, b"").unwrap(); + + let result = rename_noreplace(&src, &dst); + assert_eq!(result.unwrap_err(), nix::Error::EEXIST); + } + + #[test] + fn test_works_when_not_exists() { + let testdir = testdir(); + println!("testdir: {}", testdir.path()); + + let src = testdir.path().join("src"); + let dst = testdir.path().join("dst"); + + fs::write(&src, b"content").unwrap(); + + rename_noreplace(src.as_std_path(), dst.as_std_path()).unwrap(); + assert_eq!( + "content", + String::from_utf8(std::fs::read(&dst).unwrap()).unwrap() + ); + } +} diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index 46eadee1da..b703e883de 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -34,6 +34,8 @@ pub enum Generation { /// scenarios where pageservers might otherwise issue conflicting writes to /// remote storage impl Generation { + pub const MAX: Self = Self::Valid(u32::MAX); + /// Create a new Generation that represents a legacy key format with /// no generation suffix pub fn none() -> Self { @@ -45,7 +47,7 @@ impl Generation { Self::Broken } - pub fn new(v: u32) -> Self { + pub const fn new(v: u32) -> Self { Self::Valid(v) } @@ -54,12 +56,10 @@ impl Generation { } #[track_caller] - pub fn get_suffix(&self) -> String { + pub fn get_suffix(&self) -> impl std::fmt::Display { match self { - Self::Valid(v) => { - format!("-{:08x}", v) - } - Self::None => "".into(), + Self::Valid(v) => GenerationFileSuffix(Some(*v)), + Self::None => GenerationFileSuffix(None), Self::Broken => { panic!("Tried to use a broken generation"); } @@ -90,6 +90,7 @@ impl Generation { } } + #[track_caller] pub fn next(&self) -> 
Generation { match self { Self::Valid(n) => Self::Valid(*n + 1), @@ -107,6 +108,18 @@ impl Generation { } } +struct GenerationFileSuffix(Option); + +impl std::fmt::Display for GenerationFileSuffix { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(g) = self.0 { + write!(f, "-{g:08x}") + } else { + Ok(()) + } + } +} + impl Serialize for Generation { fn serialize(&self, serializer: S) -> Result where @@ -164,4 +177,24 @@ mod test { assert!(Generation::none() < Generation::new(0)); assert!(Generation::none() < Generation::new(1)); } + + #[test] + fn suffix_is_stable() { + use std::fmt::Write as _; + + // the suffix must remain stable through-out the pageserver remote storage evolution and + // not be changed accidentially without thinking about migration + let examples = [ + (line!(), Generation::None, ""), + (line!(), Generation::Valid(0), "-00000000"), + (line!(), Generation::Valid(u32::MAX), "-ffffffff"), + ]; + + let mut s = String::new(); + for (line, gen, expected) in examples { + s.clear(); + write!(s, "{}", &gen.get_suffix()).expect("string grows"); + assert_eq!(s, expected, "example on {line}"); + } + } } diff --git a/libs/utils/src/hex.rs b/libs/utils/src/hex.rs index fc0bb7e4a2..382f805a96 100644 --- a/libs/utils/src/hex.rs +++ b/libs/utils/src/hex.rs @@ -19,13 +19,13 @@ /// // right: [0x68; 1] /// # fn serialize_something() -> Vec { "hello world".as_bytes().to_vec() } /// ``` -#[derive(PartialEq)] -pub struct Hex<'a>(pub &'a [u8]); +pub struct Hex(pub S); -impl std::fmt::Debug for Hex<'_> { +impl> std::fmt::Debug for Hex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[")?; - for (i, c) in self.0.chunks(16).enumerate() { + let chunks = self.0.as_ref().chunks(16); + for (i, c) in chunks.enumerate() { if i > 0 && !c.is_empty() { writeln!(f, ", ")?; } @@ -36,6 +36,15 @@ impl std::fmt::Debug for Hex<'_> { write!(f, "0x{b:02x}")?; } } - write!(f, "; {}]", self.0.len()) + write!(f, "; {}]", 
self.0.as_ref().len()) + } +} + +impl, L: AsRef<[u8]>> PartialEq> for Hex { + fn eq(&self, other: &Hex) -> bool { + let left = self.0.as_ref(); + let right = other.0.as_ref(); + + left == right } } diff --git a/libs/utils/src/history_buffer.rs b/libs/utils/src/history_buffer.rs index 1f07f5560f..bd35e2bad6 100644 --- a/libs/utils/src/history_buffer.rs +++ b/libs/utils/src/history_buffer.rs @@ -47,9 +47,10 @@ impl ops::Deref for HistoryBufferWithDropCounter { } } -#[derive(serde::Serialize)] +#[derive(serde::Serialize, serde::Deserialize)] struct SerdeRepr { buffer: Vec, + buffer_size: usize, drop_count: u64, } @@ -61,6 +62,7 @@ where let HistoryBufferWithDropCounter { buffer, drop_count } = value; SerdeRepr { buffer: buffer.iter().cloned().collect(), + buffer_size: L, drop_count: *drop_count, } } @@ -78,19 +80,52 @@ where } } +impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter +where + T: Clone + serde::Deserialize<'de>, +{ + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let SerdeRepr { + buffer: des_buffer, + drop_count, + buffer_size, + } = SerdeRepr::::deserialize(deserializer)?; + if buffer_size != L { + use serde::de::Error; + return Err(D::Error::custom(format!( + "invalid buffer_size, expecting {L} got {buffer_size}" + ))); + } + let mut buffer = HistoryBuffer::new(); + buffer.extend(des_buffer); + Ok(HistoryBufferWithDropCounter { buffer, drop_count }) + } +} + #[cfg(test)] mod test { use super::HistoryBufferWithDropCounter; #[test] fn test_basics() { - let mut b = HistoryBufferWithDropCounter::<_, 2>::default(); + let mut b = HistoryBufferWithDropCounter::::default(); b.write(1); b.write(2); b.write(3); assert!(b.iter().any(|e| *e == 2)); assert!(b.iter().any(|e| *e == 3)); assert!(!b.iter().any(|e| *e == 1)); + + // round-trip serde + let round_tripped: HistoryBufferWithDropCounter = + serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap(); + assert_eq!( + 
round_tripped.iter().cloned().collect::>(), + b.iter().cloned().collect::>() + ); } #[test] diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 550ab10700..f8a5f68131 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; -use tracing::{self, debug, info, info_span, warn, Instrument}; +use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; use std::str::FromStr; @@ -156,6 +156,10 @@ pub struct ChannelWriter { buffer: BytesMut, pub tx: mpsc::Sender>, written: usize, + /// Time spent waiting for the channel to make progress. It is not the same as time to upload a + /// buffer because we cannot know anything about that, but this should allow us to understand + /// the actual time taken without the time spent `std::thread::park`ed. + wait_time: std::time::Duration, } impl ChannelWriter { @@ -168,6 +172,7 @@ impl ChannelWriter { buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2), tx, written: 0, + wait_time: std::time::Duration::ZERO, } } @@ -180,6 +185,8 @@ impl ChannelWriter { tracing::trace!(n, "flushing"); let ready = self.buffer.split().freeze(); + let wait_started_at = std::time::Instant::now(); + // not ideal to call from blocking code to block_on, but we are sure that this // operation does not spawn_blocking other tasks let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async { @@ -192,6 +199,9 @@ impl ChannelWriter { // sending it to the client. 
Ok(()) }); + + self.wait_time += wait_started_at.elapsed(); + if res.is_err() { return Err(std::io::ErrorKind::BrokenPipe.into()); } @@ -202,6 +212,10 @@ impl ChannelWriter { pub fn flushed_bytes(&self) -> usize { self.written } + + pub fn wait_time(&self) -> std::time::Duration { + self.wait_time + } } impl std::io::Write for ChannelWriter { @@ -231,7 +245,7 @@ impl std::io::Write for ChannelWriter { } } -async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { +pub async fn prometheus_metrics_handler(_req: Request) -> Result, ApiError> { SERVE_METRICS_COUNT.inc(); let started_at = std::time::Instant::now(); @@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request) -> Result { tracing::info!( bytes = writer.flushed_bytes(), - elapsed_ms = started_at.elapsed().as_millis(), + total_ms = total.as_millis(), + spawning_ms = spawned_in.as_millis(), + collection_ms = collected_in.as_millis(), + encoding_ms = encoded_in.as_millis(), "responded /metrics" ); } Err(e) => { - tracing::warn!("failed to write out /metrics response: {e:#}"); + // there is a chance that this error is not the BrokenPipe we generate in the writer + // for "closed connection", but it is highly unlikely. + tracing::warn!( + after_bytes = writer.flushed_bytes(), + total_ms = total.as_millis(), + spawning_ms = spawned_in.as_millis(), + collection_ms = collected_in.as_millis(), + encoding_ms = encoded_in.as_millis(), + "failed to write out /metrics response: {e:?}" + ); // semantics of this error are quite... unclear. we want to error the stream out to // abort the response to somehow notify the client that we failed. 
// @@ -323,7 +367,6 @@ pub fn make_router() -> RouterBuilder { .middleware(Middleware::post_with_info( add_request_id_header_to_response, )) - .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .err_handler(route_error_handler) } diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 3e9281ac81..3d863a6518 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -34,6 +34,9 @@ pub enum ApiError { #[error("Timeout")] Timeout(Cow<'static, str>), + #[error("Request cancelled")] + Cancelled, + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -74,6 +77,10 @@ impl ApiError { err.to_string(), StatusCode::REQUEST_TIMEOUT, ), + ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status( + self.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, @@ -131,7 +138,10 @@ pub fn api_error_handler(api_error: ApiError) -> Response { ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"), ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"), - _ => error!("Error processing HTTP request: {api_error:#}"), + ApiError::ShuttingDown => info!("Shut down while processing HTTP request"), + ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"), + ApiError::Cancelled => info!("Request cancelled while processing HTTP request"), + _ => info!("Error processing HTTP request: {api_error:#}"), } api_error.into_response() diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 57dcc27719..0409001f4f 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -1,3 +1,4 @@ +use std::num::ParseIntError; use std::{fmt, str::FromStr}; use anyhow::Context; @@ -374,6 +375,13 @@ impl fmt::Display 
for NodeId { } } +impl FromStr for NodeId { + type Err = ParseIntError; + fn from_str(s: &str) -> Result { + Ok(NodeId(u64::from_str(s)?)) + } +} + #[cfg(test)] mod tests { use serde_assert::{Deserializer, Serializer, Token, Tokens}; diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 890061dc59..2953f0aad4 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -63,6 +63,7 @@ pub mod measured_stream; pub mod serde_percent; pub mod serde_regex; +pub mod serde_system_time; pub mod pageserver_feedback; @@ -87,6 +88,12 @@ pub mod failpoint_support; pub mod yielding_loop; +pub mod zstd; + +pub mod env; + +pub mod poison; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 987b9d9ad2..59c66ca757 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -63,6 +63,7 @@ impl UnwrittenLockFile { pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result { let lock_file = fs::OpenOptions::new() .create(true) // O_CREAT + .truncate(true) .write(true) .open(lock_file_path) .context("open lock file")?; diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index b3269ae049..1aebe91428 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -415,7 +415,6 @@ mod tests { use super::*; - use serde::ser::Serialize; use serde_assert::{Deserializer, Serializer, Token, Tokens}; #[test] diff --git a/libs/utils/src/nonblock.rs b/libs/utils/src/nonblock.rs index 8b1fd71ae6..05e2e3af4c 100644 --- a/libs/utils/src/nonblock.rs +++ b/libs/utils/src/nonblock.rs @@ -5,10 +5,10 @@ use std::os::unix::io::RawFd; pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> { let bits = fcntl(fd, F_GETFL)?; - // Safety: If F_GETFL returns some unknown bits, they should be valid + // If F_GETFL returns some unknown bits, they should be valid // for passing back to 
F_SETFL, too. If we left them out, the F_SETFL // would effectively clear them, which is not what we want. - let mut flags = unsafe { OFlag::from_bits_unchecked(bits) }; + let mut flags = OFlag::from_bits_retain(bits); flags |= OFlag::O_NONBLOCK; fcntl(fd, F_SETFL(flags))?; diff --git a/libs/utils/src/pageserver_feedback.rs b/libs/utils/src/pageserver_feedback.rs index c9fbdde928..3ddfa44f41 100644 --- a/libs/utils/src/pageserver_feedback.rs +++ b/libs/utils/src/pageserver_feedback.rs @@ -29,12 +29,10 @@ pub struct PageserverFeedback { // Serialize with RFC3339 format. #[serde(with = "serde_systemtime")] pub replytime: SystemTime, + /// Used to track feedbacks from different shards. Always zero for unsharded tenants. + pub shard_number: u32, } -// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. -// Do not remove previously available fields because this might be backwards incompatible. -pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; - impl PageserverFeedback { pub fn empty() -> PageserverFeedback { PageserverFeedback { @@ -43,6 +41,7 @@ impl PageserverFeedback { remote_consistent_lsn: Lsn::INVALID, disk_consistent_lsn: Lsn::INVALID, replytime: *PG_EPOCH, + shard_number: 0, } } @@ -59,17 +58,26 @@ impl PageserverFeedback { // // TODO: change serialized fields names once all computes migrate to rename. 
pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys + let buf_ptr = buf.len(); + buf.put_u8(0); // # of keys, will be filled later + let mut nkeys = 0; + + nkeys += 1; buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); + nkeys += 1; buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); buf.put_u64(self.last_received_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); buf.put_u64(self.disk_consistent_lsn.0); + + nkeys += 1; buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); buf.put_u64(self.remote_consistent_lsn.0); @@ -80,9 +88,19 @@ impl PageserverFeedback { .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; + nkeys += 1; buf.put_slice(b"ps_replytime\0"); buf.put_i32(8); buf.put_i64(timestamp); + + if self.shard_number > 0 { + nkeys += 1; + buf.put_slice(b"shard_number\0"); + buf.put_i32(4); + buf.put_u32(self.shard_number); + } + + buf[buf_ptr] = nkeys; } // Deserialize PageserverFeedback message @@ -123,6 +141,11 @@ impl PageserverFeedback { rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } + b"shard_number" => { + let len = buf.get_i32(); + assert_eq!(len, 4); + rf.shard_number = buf.get_u32(); + } _ => { let len = buf.get_i32(); warn!( @@ -194,10 +217,7 @@ mod tests { rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys - if let Some(first) = data.first_mut() { - *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; - } - + data[0] += 1; data.put_slice(b"new_field_one\0"); data.put_i32(8); data.put_u64(42); diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs new file mode 100644 index 0000000000..27378c69fc --- /dev/null +++ b/libs/utils/src/poison.rs @@ -0,0 +1,121 @@ +//! Protect a piece of state from reuse after it is left in an inconsistent state. +//! +//! # Example +//! +//! ``` +//! 
# tokio::runtime::Builder::new_current_thread().enable_all().build().unwrap().block_on(async { +//! use utils::poison::Poison; +//! use std::time::Duration; +//! +//! struct State { +//! clean: bool, +//! } +//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true })); +//! +//! let mut mutex_guard = state.lock().await; +//! let mut poison_guard = mutex_guard.check_and_arm()?; +//! let state = poison_guard.data_mut(); +//! state.clean = false; +//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail. +//! tokio::time::sleep(Duration::from_secs(10)).await; +//! state.clean = true; +//! poison_guard.disarm(); +//! # Ok::<(), utils::poison::Error>(()) +//! # }); +//! ``` + +use tracing::warn; + +pub struct Poison { + what: &'static str, + state: State, + data: T, +} + +#[derive(Clone, Copy)] +enum State { + Clean, + Armed, + Poisoned { at: chrono::DateTime }, +} + +impl Poison { + /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed. + pub fn new(what: &'static str, data: T) -> Self { + Self { + what, + state: State::Clean, + data, + } + } + + /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state. + pub fn check_and_arm(&mut self) -> Result, Error> { + match self.state { + State::Clean => { + self.state = State::Armed; + Ok(Guard(self)) + } + State::Armed => unreachable!("transient state"), + State::Poisoned { at } => Err(Error::Poisoned { + what: self.what, + at, + }), + } + } +} + +/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state. +/// Once modifications are done, use [`Self::disarm`]. +/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned +/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. 
+pub struct Guard<'a, T>(&'a mut Poison); + +impl<'a, T> Guard<'a, T> { + pub fn data(&self) -> &T { + &self.0.data + } + pub fn data_mut(&mut self) -> &mut T { + &mut self.0.data + } + + pub fn disarm(self) { + match self.0.state { + State::Clean => unreachable!("we set it to Armed in check_and_arm()"), + State::Armed => { + self.0.state = State::Clean; + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +impl<'a, T> Drop for Guard<'a, T> { + fn drop(&mut self) { + match self.0.state { + State::Clean => { + // set by disarm() + } + State::Armed => { + // still armed => poison it + let at = chrono::Utc::now(); + self.0.state = State::Poisoned { at }; + warn!(at=?at, "poisoning {}", self.0.what); + } + State::Poisoned { at } => { + unreachable!("we fail check_and_arm() if it's in that state: {at}") + } + } + } +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("poisoned at {at}: {what}")] + Poisoned { + what: &'static str, + at: chrono::DateTime, + }, +} diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index effc9c67b5..375b227b99 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -1,12 +1,11 @@ #![warn(missing_docs)] -use std::cmp::{Eq, Ordering, PartialOrd}; +use std::cmp::{Eq, Ordering}; use std::collections::BinaryHeap; -use std::fmt::Debug; use std::mem; use std::sync::Mutex; use std::time::Duration; -use tokio::sync::watch::{channel, Receiver, Sender}; +use tokio::sync::watch::{self, channel}; use tokio::time::timeout; /// An error happened while waiting for a number @@ -35,23 +34,73 @@ pub trait MonotonicCounter { fn cnt_value(&self) -> V; } -/// Internal components of a `SeqWait` -struct SeqWaitInt +/// Heap of waiters, lowest numbers pop first. 
+struct Waiters where - S: MonotonicCounter, V: Ord, { - waiters: BinaryHeap>, - current: S, - shutdown: bool, + heap: BinaryHeap>, + /// Number of the first waiter in the heap, or None if there are no waiters. + status_channel: watch::Sender>, +} + +impl Waiters +where + V: Ord + Copy, +{ + fn new() -> Self { + Waiters { + heap: BinaryHeap::new(), + status_channel: channel(None).0, + } + } + + /// `status_channel` contains the number of the first waiter in the heap. + /// This function should be called whenever waiters heap changes. + fn update_status(&self) { + let first_waiter = self.heap.peek().map(|w| w.wake_num); + let _ = self.status_channel.send_replace(first_waiter); + } + + /// Add new waiter to the heap, return a channel that will be notified when the number arrives. + fn add(&mut self, num: V) -> watch::Receiver<()> { + let (tx, rx) = channel(()); + self.heap.push(Waiter { + wake_num: num, + wake_channel: tx, + }); + self.update_status(); + rx + } + + /// Pop all waiters <= num from the heap. Collect channels in a vector, + /// so that caller can wake them up. + fn pop_leq(&mut self, num: V) -> Vec> { + let mut wake_these = Vec::new(); + while let Some(n) = self.heap.peek() { + if n.wake_num > num { + break; + } + wake_these.push(self.heap.pop().unwrap().wake_channel); + } + self.update_status(); + wake_these + } + + /// Used on shutdown to efficiently drop all waiters. + fn take_all(&mut self) -> BinaryHeap> { + let heap = mem::take(&mut self.heap); + self.update_status(); + heap + } } struct Waiter where T: Ord, { - wake_num: T, // wake me when this number arrives ... - wake_channel: Sender<()>, // ... by sending a message to this channel + wake_num: T, // wake me when this number arrives ... + wake_channel: watch::Sender<()>, // ... by sending a message to this channel } // BinaryHeap is a max-heap, and we want a min-heap. 
Reverse the ordering here @@ -76,6 +125,17 @@ impl PartialEq for Waiter { impl Eq for Waiter {} +/// Internal components of a `SeqWait` +struct SeqWaitInt +where + S: MonotonicCounter, + V: Ord, +{ + waiters: Waiters, + current: S, + shutdown: bool, +} + /// A tool for waiting on a sequence number /// /// This provides a way to wait the arrival of a number. @@ -108,7 +168,7 @@ where /// Create a new `SeqWait`, initialized to a particular number pub fn new(starting_num: S) -> Self { let internal = SeqWaitInt { - waiters: BinaryHeap::new(), + waiters: Waiters::new(), current: starting_num, shutdown: false, }; @@ -128,9 +188,8 @@ where // Block any future waiters from starting internal.shutdown = true; - // This will steal the entire waiters map. - // When we drop it all waiters will be woken. - mem::take(&mut internal.waiters) + // Take all waiters to drop them later. + internal.waiters.take_all() // Drop the lock as we exit this scope. }; @@ -182,9 +241,21 @@ where } } + /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`. + pub fn would_wait_for(&self, num: V) -> Result<(), V> { + let internal = self.internal.lock().unwrap(); + let cnt = internal.current.cnt_value(); + drop(internal); + if cnt >= num { + Ok(()) + } else { + Err(cnt) + } + } + /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. - fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { + fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { let mut internal = self.internal.lock().unwrap(); if internal.current.cnt_value() >= num { return Ok(None); @@ -193,12 +264,8 @@ where return Err(SeqWaitError::Shutdown); } - // Create a new channel. - let (tx, rx) = channel(()); - internal.waiters.push(Waiter { - wake_num: num, - wake_channel: tx, - }); + // Add waiter channel to the queue. + let rx = internal.waiters.add(num); // Drop the lock as we exit this scope. 
Ok(Some(rx)) } @@ -219,16 +286,8 @@ where } internal.current.cnt_advance(num); - // Pop all waiters <= num from the heap. Collect them in a vector, and - // wake them up after releasing the lock. - let mut wake_these = Vec::new(); - while let Some(n) = internal.waiters.peek() { - if n.wake_num > num { - break; - } - wake_these.push(internal.waiters.pop().unwrap().wake_channel); - } - wake_these + // Pop all waiters <= num from the heap. + internal.waiters.pop_leq(num) }; for tx in wake_these { @@ -243,13 +302,29 @@ where pub fn load(&self) -> S { self.internal.lock().unwrap().current } + + /// Get a Receiver for the current status. + /// + /// The current status is the number of the first waiter in the queue, + /// or None if there are no waiters. + /// + /// This receiver will be notified whenever the status changes. + /// It is useful for receiving notifications when the first waiter + /// starts waiting for a number, or when there are no more waiters left. + pub fn status_receiver(&self) -> watch::Receiver> { + self.internal + .lock() + .unwrap() + .waiters + .status_channel + .subscribe() + } } #[cfg(test)] mod tests { use super::*; use std::sync::Arc; - use std::time::Duration; impl MonotonicCounter for i32 { fn cnt_advance(&mut self, val: i32) { diff --git a/libs/utils/src/serde_system_time.rs b/libs/utils/src/serde_system_time.rs new file mode 100644 index 0000000000..b0f6934e87 --- /dev/null +++ b/libs/utils/src/serde_system_time.rs @@ -0,0 +1,55 @@ +//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision. 
+ +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct SystemTime( + #[serde( + deserialize_with = "deser_rfc3339_millis", + serialize_with = "ser_rfc3339_millis" + )] + pub std::time::SystemTime, +); + +fn ser_rfc3339_millis( + ts: &std::time::SystemTime, + serializer: S, +) -> Result { + serializer.collect_str(&humantime::format_rfc3339_millis(*ts)) +} + +fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds. + fn to_millisecond_precision(time: SystemTime) -> SystemTime { + match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) { + Ok(duration) => { + let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis()); + SystemTime( + std::time::SystemTime::UNIX_EPOCH + + std::time::Duration::from_millis(total_millis), + ) + } + Err(_) => time, + } + } + + #[test] + fn test_serialize_deserialize() { + let input = SystemTime(std::time::SystemTime::now()); + let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0)); + let serialized = serde_json::to_string(&input).unwrap(); + assert_eq!(expected_serialized, serialized); + let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap(); + assert_eq!(to_millisecond_precision(input), deserialized); + } +} diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index dc4a599111..ecc5353be3 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -221,7 +221,7 @@ impl RcuWaitList { #[cfg(test)] mod tests { use super::*; - use std::sync::{Arc, Mutex}; + use std::sync::Mutex; use std::time::Duration; 
#[tokio::test] diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index abc3842da8..156b99a010 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -1,4 +1,10 @@ -use std::{sync::Arc, time::Duration}; +use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; /// Gates are a concurrency helper, primarily used for implementing safe shutdown. /// @@ -6,62 +12,70 @@ use std::{sync::Arc, time::Duration}; /// the resource calls `close()` when they want to ensure that all holders of guards /// have released them, and that no future guards will be issued. pub struct Gate { - /// Each caller of enter() takes one unit from the semaphore. In close(), we - /// take all the units to ensure all GateGuards are destroyed. - sem: Arc, - - /// For observability only: a name that will be used to log warnings if a particular - /// gate is holding up shutdown - name: String, + inner: Arc, } impl std::fmt::Debug for Gate { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Gate<{}>", self.name) + f.debug_struct("Gate") + // use this for identification + .field("ptr", &Arc::as_ptr(&self.inner)) + .field("inner", &self.inner) + .finish() + } +} + +struct GateInner { + sem: tokio::sync::Semaphore, + closing: std::sync::atomic::AtomicBool, +} + +impl std::fmt::Debug for GateInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let avail = self.sem.available_permits(); + + let guards = u32::try_from(avail) + .ok() + // the sem only supports 32-bit ish amount, but lets play it safe + .and_then(|x| Gate::MAX_UNITS.checked_sub(x)); + + let closing = self.closing.load(Ordering::Relaxed); + + if let Some(guards) = guards { + f.debug_struct("Gate") + .field("remaining_guards", &guards) + .field("closing", &closing) + .finish() + } else { + f.debug_struct("Gate") + .field("avail_permits", &avail) + .field("closing", &closing) + .finish() + } } } /// RAII guard for a 
[`Gate`]: as long as this exists, calls to [`Gate::close`] will /// not complete. #[derive(Debug)] -pub struct GateGuard(tokio::sync::OwnedSemaphorePermit); +pub struct GateGuard { + // Record the span where the gate was entered, so that we can identify who was blocking Gate::close + span_at_enter: tracing::Span, + gate: Arc, +} -/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate -async fn warn_if_stuck( - fut: Fut, - name: &str, - warn_period: std::time::Duration, -) -> ::Output { - let started = std::time::Instant::now(); - - let mut fut = std::pin::pin!(fut); - - let mut warned = false; - let ret = loop { - match tokio::time::timeout(warn_period, &mut fut).await { - Ok(ret) => break ret, - Err(_) => { - tracing::warn!( - gate = name, - elapsed_ms = started.elapsed().as_millis(), - "still waiting, taking longer than expected..." - ); - warned = true; - } +impl Drop for GateGuard { + fn drop(&mut self) { + if self.gate.closing.load(Ordering::Relaxed) { + self.span_at_enter.in_scope( + || tracing::info!(gate = ?Arc::as_ptr(&self.gate), "kept the gate from closing"), + ); } - }; - // If we emitted a warning for slowness, also emit a message when we complete, so that - // someone debugging a shutdown can know for sure whether we have moved past this operation. - if warned { - tracing::info!( - gate = name, - elapsed_ms = started.elapsed().as_millis(), - "completed, after taking longer than expected" - ) + // when the permit was acquired, it was forgotten to allow us to manage it's lifecycle + // manually, so "return" the permit now. 
+ self.gate.sem.add_permits(1); } - - ret } #[derive(Debug)] @@ -69,15 +83,19 @@ pub enum GateError { GateClosed, } -impl Gate { - const MAX_UNITS: u32 = u32::MAX; - - pub fn new(name: String) -> Self { +impl Default for Gate { + fn default() -> Self { Self { - sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)), - name, + inner: Arc::new(GateInner { + sem: tokio::sync::Semaphore::new(Self::MAX_UNITS as usize), + closing: AtomicBool::new(false), + }), } } +} + +impl Gate { + const MAX_UNITS: u32 = u32::MAX; /// Acquire a guard that will prevent close() calls from completing. If close() /// was already called, this will return an error which should be interpreted @@ -88,11 +106,23 @@ impl Gate { /// to avoid blocking close() indefinitely: typically types that contain a Gate will /// also contain a CancellationToken. pub fn enter(&self) -> Result { - self.sem - .clone() - .try_acquire_owned() - .map(GateGuard) - .map_err(|_| GateError::GateClosed) + let permit = self + .inner + .sem + .try_acquire() + .map_err(|_| GateError::GateClosed)?; + + // we now have the permit, let's disable the normal raii functionality and leave + // "returning" the permit to our GateGuard::drop. + // + // this is done to avoid the need for multiple Arcs (one for semaphore, next for other + // fields). + permit.forget(); + + Ok(GateGuard { + span_at_enter: tracing::Span::current(), + gate: self.inner.clone(), + }) } /// Types with a shutdown() method and a gate should call this method at the @@ -102,48 +132,89 @@ impl Gate { /// important that the holders of such guards are respecting a CancellationToken which has /// been cancelled before entering this function. 
pub async fn close(&self) { - warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await + let started_at = std::time::Instant::now(); + let mut do_close = std::pin::pin!(self.do_close()); + + // with 1s we rarely saw anything, let's try if we get more gate closing reasons with 100ms + let nag_after = Duration::from_millis(100); + + let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else { + return; + }; + + tracing::info!( + gate = ?self.as_ptr(), + elapsed_ms = started_at.elapsed().as_millis(), + "closing is taking longer than expected" + ); + + // close operation is not trying to be cancellation safe as pageserver does not need it. + // + // note: "closing" is not checked in Gate::enter -- it exists just for observability, + // dropping of GateGuard after this will log who they were. + self.inner.closing.store(true, Ordering::Relaxed); + + do_close.await; + + tracing::info!( + gate = ?self.as_ptr(), + elapsed_ms = started_at.elapsed().as_millis(), + "close completed" + ); + } + + /// Used as an identity of a gate. This identity will be resolved to something useful when + /// it's actually closed in a hopefully sensible `tracing::Span` which will describe it even + /// more. + /// + /// `GateGuard::drop` also logs this pointer when it has realized it has been keeping the gate + /// open for too long. + fn as_ptr(&self) -> *const GateInner { + Arc::as_ptr(&self.inner) } /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking /// the CancellationToken on such types is analogous to "Did shutdown start?" 
pub fn close_complete(&self) -> bool { - self.sem.is_closed() + self.inner.sem.is_closed() } + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(gate = ?self.as_ptr()))] async fn do_close(&self) { - tracing::debug!(gate = self.name, "Closing Gate..."); - match self.sem.acquire_many(Self::MAX_UNITS).await { - Ok(_units) => { + tracing::debug!("Closing Gate..."); + + match self.inner.sem.acquire_many(Self::MAX_UNITS).await { + Ok(_permit) => { // While holding all units, close the semaphore. All subsequent calls to enter() will fail. - self.sem.close(); + self.inner.sem.close(); } - Err(_) => { + Err(_closed) => { // Semaphore closed: we are the only function that can do this, so it indicates a double-call. // This is legal. Timeline::shutdown for example is not protected from being called more than // once. - tracing::debug!(gate = self.name, "Double close") + tracing::debug!("Double close") } } - tracing::debug!(gate = self.name, "Closed Gate.") + tracing::debug!("Closed Gate.") } } #[cfg(test)] mod tests { - use futures::FutureExt; - use super::*; #[tokio::test] - async fn test_idle_gate() { - // Having taken no gates, we should not be blocked in close - let gate = Gate::new("test".to_string()); + async fn close_unused() { + // Having taken no guards, we should not be blocked in close + let gate = Gate::default(); gate.close().await; + } + #[tokio::test] + async fn close_idle() { // If a guard is dropped before entering, close should not be blocked - let gate = Gate::new("test".to_string()); + let gate = Gate::default(); let guard = gate.enter().unwrap(); drop(guard); gate.close().await; @@ -152,25 +223,30 @@ mod tests { gate.enter().expect_err("enter should fail after close"); } - #[tokio::test] - async fn test_busy_gate() { - let gate = Gate::new("test".to_string()); + #[tokio::test(start_paused = true)] + async fn close_busy_gate() { + let gate = Gate::default(); + let forever = Duration::from_secs(24 * 7 * 365); - let guard = 
gate.enter().unwrap(); + let guard = + tracing::info_span!("i am holding back the gate").in_scope(|| gate.enter().unwrap()); let mut close_fut = std::pin::pin!(gate.close()); - // Close should be blocked - assert!(close_fut.as_mut().now_or_never().is_none()); + // Close should be waiting for guards to drop + tokio::time::timeout(forever, &mut close_fut) + .await + .unwrap_err(); // Attempting to enter() should fail, even though close isn't done yet. gate.enter() .expect_err("enter should fail after entering close"); + // this will now log, which we cannot verify except manually drop(guard); // Guard is gone, close should finish - assert!(close_fut.as_mut().now_or_never().is_some()); + close_fut.await; // Attempting to enter() is still forbidden gate.enter().expect_err("enter should fail finishing close"); diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index 0ccaf4e716..1abd3d9861 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -69,37 +69,87 @@ impl OnceCell { F: FnOnce(InitPermit) -> Fut, Fut: std::future::Future>, { - let sem = { + loop { + let sem = { + let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); + } + guard.init_semaphore.clone() + }; + + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.lock().unwrap(); + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + }; + + permit.forget(); + } + + let permit = InitPermit(sem); + let (value, _permit) = factory(permit).await?; + let guard = self.inner.lock().unwrap(); - if guard.value.is_some() { - return Ok(Guard(guard)); - } - 
guard.init_semaphore.clone() - }; - let permit = { - // increment the count for the duration of queued - let _guard = CountWaitingInitializers::start(self); - sem.acquire_owned().await - }; - - match permit { - Ok(permit) => { - let permit = InitPermit(permit); - let (value, _permit) = factory(permit).await?; + return Ok(Self::set0(value, guard)); + } + } + /// Returns a guard to an existing initialized value, or returns an unique initialization + /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`. + pub async fn get_or_init_detached(&self) -> Result, InitPermit> { + // It looks like OnceCell::get_or_init could be implemented using this method instead of + // duplication. However, that makes the future be !Send due to possibly holding on to the + // MutexGuard over an await point. + loop { + let sem = { let guard = self.inner.lock().unwrap(); + if guard.value.is_some() { + return Ok(Guard(guard)); + } + guard.init_semaphore.clone() + }; - Ok(Self::set0(value, guard)) - } - Err(_closed) => { - let guard = self.inner.lock().unwrap(); - assert!( - guard.value.is_some(), - "semaphore got closed, must be initialized" - ); - return Ok(Guard(guard)); + { + let permit = { + // increment the count for the duration of queued + let _guard = CountWaitingInitializers::start(self); + sem.acquire().await + }; + + let Ok(permit) = permit else { + let guard = self.inner.lock().unwrap(); + if !Arc::ptr_eq(&sem, &guard.init_semaphore) { + // there was a take_and_deinit in between + continue; + } + assert!( + guard.value.is_some(), + "semaphore got closed, must be initialized" + ); + return Ok(Guard(guard)); + }; + + permit.forget(); } + + let permit = InitPermit(sem); + return Err(permit); } } @@ -142,6 +192,14 @@ impl OnceCell { } } + /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never + /// initialized. 
+ pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let inner = self.inner.get_mut().unwrap(); + + inner.take_and_deinit() + } + /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete. pub fn initializer_count(&self) -> usize { self.initializers.load(Ordering::Relaxed) @@ -195,30 +253,58 @@ impl<'a, T> Guard<'a, T> { /// /// The permit will be on a semaphore part of the new internal value, and any following /// [`OnceCell::get_or_init`] will wait on it to complete. - pub fn take_and_deinit(&mut self) -> (T, InitPermit) { - let mut swapped = Inner::default(); - let permit = swapped - .init_semaphore - .clone() - .try_acquire_owned() - .expect("we just created this"); - std::mem::swap(&mut *self.0, &mut swapped); - swapped - .value - .map(|v| (v, InitPermit(permit))) + pub fn take_and_deinit(mut self) -> (T, InitPermit) { + self.0 + .take_and_deinit() .expect("guard is not created unless value has been initialized") } } +impl Inner { + pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> { + let value = self.value.take()?; + + let mut swapped = Inner::default(); + let sem = swapped.init_semaphore.clone(); + // acquire and forget right away, moving the control over to InitPermit + sem.try_acquire().expect("we just created this").forget(); + let permit = InitPermit(sem); + std::mem::swap(self, &mut swapped); + Some((value, permit)) + } +} + /// Type held by OnceCell (de)initializing task. -pub struct InitPermit(tokio::sync::OwnedSemaphorePermit); +/// +/// On drop, this type will return the permit. 
+pub struct InitPermit(Arc); + +impl std::fmt::Debug for InitPermit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ptr = Arc::as_ptr(&self.0) as *const (); + f.debug_tuple("InitPermit").field(&ptr).finish() + } +} + +impl Drop for InitPermit { + fn drop(&mut self) { + assert_eq!( + self.0.available_permits(), + 0, + "InitPermit should only exist as the unique permit" + ); + self.0.add_permits(1); + } +} #[cfg(test)] mod tests { + use futures::Future; + use super::*; use std::{ convert::Infallible, - sync::atomic::{AtomicUsize, Ordering}, + pin::{pin, Pin}, time::Duration, }; @@ -380,4 +466,138 @@ mod tests { .unwrap(); assert_eq!(*g, "now initialized"); } + + #[tokio::test(start_paused = true)] + async fn reproduce_init_take_deinit_race() { + init_take_deinit_scenario(|cell, factory| { + Box::pin(async { + cell.get_or_init(factory).await.unwrap(); + }) + }) + .await; + } + + type BoxedInitFuture = Pin>>>; + type BoxedInitFunction = Box BoxedInitFuture>; + + /// Reproduce an assertion failure. + /// + /// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`. + /// We currently only have one, but the structure is kept. + async fn init_take_deinit_scenario(init_way: F) + where + F: for<'a> Fn( + &'a OnceCell<&'static str>, + BoxedInitFunction<&'static str, Infallible>, + ) -> Pin + 'a>>, + { + let cell = OnceCell::default(); + + // acquire the init_semaphore only permit to drive initializing tasks in order to waiting + // on the same semaphore. 
+ let permit = cell + .inner + .lock() + .unwrap() + .init_semaphore + .clone() + .try_acquire_owned() + .unwrap(); + + let mut t1 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })), + )); + + let mut t2 = pin!(init_way( + &cell, + Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })), + )); + + // drive t2 first to the init_semaphore -- the timeout will be hit once t2 future can + // no longer make progress + tokio::select! { + _ = &mut t2 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // followed by t1 in the init_semaphore + tokio::select! { + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // now let t2 proceed and initialize + drop(permit); + t2.await; + + let (s, permit) = { cell.get().unwrap().take_and_deinit() }; + assert_eq!("t2", s); + + // now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from + // the new one. + tokio::select! 
{ + _ = &mut t1 => unreachable!("it cannot get permit"), + _ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {} + } + + // only now we get to initialize it + drop(permit); + t1.await; + + assert_eq!("t1", *cell.get().unwrap()); + } + + #[tokio::test(start_paused = true)] + async fn detached_init_smoke() { + let target = OnceCell::default(); + + let Err(permit) = target.get_or_init_detached().await else { + unreachable!("it is not initialized") + }; + + tokio::time::timeout( + std::time::Duration::from_secs(3600 * 24 * 7 * 365), + target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }), + ) + .await + .expect_err("should timeout since we are already holding the permit"); + + target.set(42, permit); + + let (_answer, permit) = { + let guard = target + .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) }) + .await + .unwrap(); + + assert_eq!(*guard, 42); + + guard.take_and_deinit() + }; + + assert!(target.get().is_none()); + + target.set(11, permit); + + assert_eq!(*target.get().unwrap(), 11); + } + + #[tokio::test] + async fn take_and_deinit_on_mut() { + use std::convert::Infallible; + + let mut target = OnceCell::::default(); + assert!(target.take_and_deinit().is_none()); + + target + .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) }) + .await + .unwrap(); + + let again = target.take_and_deinit(); + assert!(matches!(again, Some((42, _))), "{again:?}"); + + assert!(target.take_and_deinit().is_none()); + } } diff --git a/libs/utils/src/tcp_listener.rs b/libs/utils/src/tcp_listener.rs index 7666ad138c..6b35d3d63a 100644 --- a/libs/utils/src/tcp_listener.rs +++ b/libs/utils/src/tcp_listener.rs @@ -1,7 +1,6 @@ use std::{ io, net::{TcpListener, ToSocketAddrs}, - os::unix::prelude::AsRawFd, }; use nix::sys::socket::{setsockopt, sockopt::ReuseAddr}; @@ -10,7 +9,7 @@ use nix::sys::socket::{setsockopt, sockopt::ReuseAddr}; pub fn bind(addr: A) -> io::Result { let listener = TcpListener::bind(addr)?; - 
setsockopt(listener.as_raw_fd(), ReuseAddr, &true)?; + setsockopt(&listener, ReuseAddr, &true)?; Ok(listener) } diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index db17f7d8cd..d24c81ad0b 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -20,13 +20,13 @@ //! //! // Then, in the main code: //! -//! let span = tracing::info_span!("TestSpan", test_id = 1); +//! let span = tracing::info_span!("TestSpan", tenant_id = 1); //! let _guard = span.enter(); //! //! // ... down the call stack //! -//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; -//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]); +//! use utils::tracing_span_assert::{check_fields_present, ConstExtractor}; +//! let extractor = ConstExtractor::new("tenant_id"); //! if let Err(missing) = check_fields_present!([&extractor]) { //! // if you copypaste this to a custom assert method, remember to add #[track_caller] //! // to get the "user" code location for the panic. 
@@ -45,27 +45,26 @@ pub enum ExtractionResult { } pub trait Extractor: Send + Sync + std::fmt::Debug { - fn name(&self) -> &str; + fn id(&self) -> &str; fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult; } #[derive(Debug)] -pub struct MultiNameExtractor { - name: &'static str, - field_names: [&'static str; L], +pub struct ConstExtractor { + field_name: &'static str, } -impl MultiNameExtractor { - pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor { - MultiNameExtractor { name, field_names } +impl ConstExtractor { + pub const fn new(field_name: &'static str) -> ConstExtractor { + ConstExtractor { field_name } } } -impl Extractor for MultiNameExtractor { - fn name(&self) -> &str { - self.name +impl Extractor for ConstExtractor { + fn id(&self) -> &str { + self.field_name } fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult { - if fields.iter().any(|f| self.field_names.contains(&f.name())) { + if fields.iter().any(|f| f.name() == self.field_name) { ExtractionResult::Present } else { ExtractionResult::Absent @@ -203,19 +202,19 @@ mod tests { } impl<'a> fmt::Debug for MemoryIdentity<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:p}: {}", self.as_ptr(), self.0.name()) + write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } } struct Setup { _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard, - tenant_extractor: MultiNameExtractor<2>, - timeline_extractor: MultiNameExtractor<2>, + tenant_extractor: ConstExtractor, + timeline_extractor: ConstExtractor, } fn setup_current_thread() -> Setup { - let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]); - let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]); + let tenant_extractor = ConstExtractor::new("tenant_id"); + let timeline_extractor = ConstExtractor::new("timeline_id"); let registry = tracing_subscriber::registry() 
.with(tracing_subscriber::fmt::layer()) @@ -343,12 +342,12 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractor = MultiNameExtractor::new("E", ["e"]); + let extractor = ConstExtractor::new("e"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("F", ["foobar"]); + let extractor = ConstExtractor::new("foobar"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } @@ -368,16 +367,14 @@ mod tests { // normally this would work, but without any tracing-subscriber configured, both // check_field_present find nothing let _guard = subspan.enter(); - let extractors: [&dyn Extractor; 2] = [ - &MultiNameExtractor::new("E", ["e"]), - &MultiNameExtractor::new("F", ["f"]), - ]; + let extractors: [&dyn Extractor; 2] = + [&ConstExtractor::new("e"), &ConstExtractor::new("f")]; let res = check_fields_present0(extractors); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); // similarly for a not found key - let extractor = MultiNameExtractor::new("G", ["g"]); + let extractor = ConstExtractor::new("g"); let res = check_fields_present0([&extractor]); assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}"); } @@ -410,7 +407,7 @@ mod tests { let span = tracing::info_span!("foo", e = "some value"); let _guard = span.enter(); - let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])]; + let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")]; if span.is_disabled() { // the tests are running single threaded, or we got lucky and no other tests subscriber diff --git a/libs/utils/src/vec_map.rs b/libs/utils/src/vec_map.rs index 9953b447c8..18b2af14f1 100644 --- a/libs/utils/src/vec_map.rs +++ b/libs/utils/src/vec_map.rs @@ -1,27 +1,60 @@ use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds}; 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum VecMapOrdering { + Greater, + GreaterOrEqual, +} + /// Ordered map datastructure implemented in a Vec. /// Append only - can only add keys that are larger than the /// current max key. +/// Ordering can be adjusted using [`VecMapOrdering`] +/// during `VecMap` construction. #[derive(Clone, Debug)] -pub struct VecMap(Vec<(K, V)>); +pub struct VecMap { + data: Vec<(K, V)>, + ordering: VecMapOrdering, +} impl Default for VecMap { fn default() -> Self { - VecMap(Default::default()) + VecMap { + data: Default::default(), + ordering: VecMapOrdering::Greater, + } } } -#[derive(Debug)] -pub struct InvalidKey; +#[derive(thiserror::Error, Debug)] +pub enum VecMapError { + #[error("Key violates ordering constraint")] + InvalidKey, + #[error("Mismatched ordering constraints")] + ExtendOrderingError, +} impl VecMap { + pub fn new(ordering: VecMapOrdering) -> Self { + Self { + data: Vec::new(), + ordering, + } + } + + pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self { + Self { + data: Vec::with_capacity(capacity), + ordering, + } + } + pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.data.is_empty() } pub fn as_slice(&self) -> &[(K, V)] { - self.0.as_slice() + self.data.as_slice() } /// This function may panic if given a range where the lower bound is @@ -29,7 +62,7 @@ impl VecMap { pub fn slice_range>(&self, range: R) -> &[(K, V)] { use std::ops::Bound::*; - let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key); + let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key); let start_idx = match range.start_bound() { Unbounded => 0, @@ -41,7 +74,7 @@ impl VecMap { }; let end_idx = match range.end_bound() { - Unbounded => self.0.len(), + Unbounded => self.data.len(), Included(k) => match binary_search(k) { Ok(idx) => idx + 1, Err(idx) => idx, @@ -49,34 +82,30 @@ impl VecMap { Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity), }; - 
&self.0[start_idx..end_idx] + &self.data[start_idx..end_idx] } /// Add a key value pair to the map. - /// If `key` is less than or equal to the current maximum key - /// the pair will not be added and InvalidKey error will be returned. - pub fn append(&mut self, key: K, value: V) -> Result { - if let Some((last_key, _last_value)) = self.0.last() { - if &key <= last_key { - return Err(InvalidKey); - } - } + /// If `key` is not respective of the `self` ordering the + /// pair will not be added and `InvalidKey` error will be returned. + pub fn append(&mut self, key: K, value: V) -> Result { + self.validate_key_order(&key)?; let delta_size = self.instrument_vec_op(|vec| vec.push((key, value))); Ok(delta_size) } /// Update the maximum key value pair or add a new key value pair to the map. - /// If `key` is less than the current maximum key no updates or additions - /// will occur and InvalidKey error will be returned. + /// If `key` is not respective of the `self` ordering no updates or additions + /// will occur and `InvalidKey` error will be returned. 
pub fn append_or_update_last( &mut self, key: K, mut value: V, - ) -> Result<(Option, usize), InvalidKey> { - if let Some((last_key, last_value)) = self.0.last_mut() { + ) -> Result<(Option, usize), VecMapError> { + if let Some((last_key, last_value)) = self.data.last_mut() { match key.cmp(last_key) { - Ordering::Less => return Err(InvalidKey), + Ordering::Less => return Err(VecMapError::InvalidKey), Ordering::Equal => { std::mem::swap(last_value, &mut value); const DELTA_SIZE: usize = 0; @@ -100,40 +129,67 @@ impl VecMap { V: Clone, { let split_idx = self - .0 + .data .binary_search_by_key(&cutoff, extract_key) .unwrap_or_else(std::convert::identity); ( - VecMap(self.0[..split_idx].to_vec()), - VecMap(self.0[split_idx..].to_vec()), + VecMap { + data: self.data[..split_idx].to_vec(), + ordering: self.ordering, + }, + VecMap { + data: self.data[split_idx..].to_vec(), + ordering: self.ordering, + }, ) } /// Move items from `other` to the end of `self`, leaving `other` empty. - /// If any keys in `other` is less than or equal to any key in `self`, - /// `InvalidKey` error will be returned and no mutation will occur. - pub fn extend(&mut self, other: &mut Self) -> Result { - let self_last_opt = self.0.last().map(extract_key); - let other_first_opt = other.0.last().map(extract_key); + /// If the `other` ordering is different from `self` ordering + /// `ExtendOrderingError` error will be returned. + /// If any keys in `other` is not respective of the ordering defined in + /// `self`, `InvalidKey` error will be returned and no mutation will occur. 
+ pub fn extend(&mut self, other: &mut Self) -> Result { + if self.ordering != other.ordering { + return Err(VecMapError::ExtendOrderingError); + } - if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) { - if self_last >= other_first { - return Err(InvalidKey); + let other_first_opt = other.data.last().map(extract_key); + if let Some(other_first) = other_first_opt { + self.validate_key_order(other_first)?; + } + + let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data)); + Ok(delta_size) + } + + /// Validate the current last key in `self` and key being + /// inserted against the order defined in `self`. + fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> { + if let Some(last_key) = self.data.last().map(extract_key) { + match (&self.ordering, &key.cmp(last_key)) { + (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::Greater, Ordering::Greater) => {} + (VecMapOrdering::GreaterOrEqual, Ordering::Less) => { + return Err(VecMapError::InvalidKey); + } + (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {} } } - let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0)); - Ok(delta_size) + Ok(()) } /// Instrument an operation on the underlying [`Vec`]. /// Will panic if the operation decreases capacity. /// Returns the increase in memory usage caused by the op. 
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize { - let old_cap = self.0.capacity(); - op(&mut self.0); - let new_cap = self.0.capacity(); + let old_cap = self.data.capacity(); + op(&mut self.data); + let new_cap = self.data.capacity(); match old_cap.cmp(&new_cap) { Ordering::Less => { @@ -145,6 +201,36 @@ impl VecMap { Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"), } } + + /// Similar to `from_iter` defined in `FromIter` trait except + /// that it accepts an [`VecMapOrdering`] + pub fn from_iter>(iter: I, ordering: VecMapOrdering) -> Self { + let iter = iter.into_iter(); + let initial_capacity = { + match iter.size_hint() { + (lower_bound, None) => lower_bound, + (_, Some(upper_bound)) => upper_bound, + } + }; + + let mut vec_map = VecMap::with_capacity(initial_capacity, ordering); + for (key, value) in iter { + vec_map + .append(key, value) + .expect("The passed collection needs to be sorted!"); + } + + vec_map + } +} + +impl IntoIterator for VecMap { + type Item = (K, V); + type IntoIter = std::vec::IntoIter<(K, V)>; + + fn into_iter(self) -> Self::IntoIter { + self.data.into_iter() + } } fn extract_key(entry: &(K, V)) -> &K { @@ -155,7 +241,7 @@ fn extract_key(entry: &(K, V)) -> &K { mod tests { use std::{collections::BTreeMap, ops::Bound}; - use super::VecMap; + use super::{VecMap, VecMapOrdering}; #[test] fn unbounded_range() { @@ -310,5 +396,59 @@ mod tests { left.extend(&mut one_map).unwrap_err(); assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); assert_eq!(one_map.as_slice(), &[(1, ())]); + + let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual); + map_greater_or_equal.append(2, ()).unwrap(); + map_greater_or_equal.append(2, ()).unwrap(); + + left.extend(&mut map_greater_or_equal).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ()), (1, ())]); + assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]); + } + + #[test] + fn extend_with_ordering() { + let mut left = 
VecMap::new(VecMapOrdering::GreaterOrEqual); + left.append(0, ()).unwrap(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_right = VecMap::new(VecMapOrdering::Greater); + greater_right.append(0, ()).unwrap(); + left.extend(&mut greater_right).unwrap_err(); + assert_eq!(left.as_slice(), &[(0, ())]); + + let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual); + greater_or_equal_right.append(2, ()).unwrap(); + greater_or_equal_right.append(2, ()).unwrap(); + left.extend(&mut greater_or_equal_right).unwrap(); + assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]); + } + + #[test] + fn vec_map_from_sorted() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater); + assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]); + + let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]; + let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); + assert_eq!( + vec_map.as_slice(), + &[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())] + ); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater() { + let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::Greater); + } + + #[test] + #[should_panic] + fn vec_map_from_unsorted_greater_or_equal() { + let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())]; + let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual); } } diff --git a/libs/utils/src/zstd.rs b/libs/utils/src/zstd.rs new file mode 100644 index 0000000000..be2dcc00f5 --- /dev/null +++ b/libs/utils/src/zstd.rs @@ -0,0 +1,78 @@ +use std::io::SeekFrom; + +use anyhow::{Context, Result}; +use async_compression::{ + tokio::{bufread::ZstdDecoder, write::ZstdEncoder}, + zstd::CParameter, + Level, +}; +use camino::Utf8Path; +use nix::NixPath; +use tokio::{ + fs::{File, OpenOptions}, + io::AsyncBufRead, + io::AsyncSeekExt, + io::AsyncWriteExt, +}; +use 
tokio_tar::{Archive, Builder, HeaderMode}; +use walkdir::WalkDir; + +/// Creates a Zstandard tarball. +pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> { + let file = OpenOptions::new() + .create(true) + .truncate(true) + .read(true) + .write(true) + .open(&tarball) + .await + .with_context(|| format!("tempfile creation {tarball}"))?; + + let mut paths = Vec::new(); + for entry in WalkDir::new(path) { + let entry = entry?; + let metadata = entry.metadata().expect("error getting dir entry metadata"); + // Also allow directories so that we also get empty directories + if !(metadata.is_file() || metadata.is_dir()) { + continue; + } + let path = entry.into_path(); + paths.push(path); + } + // Do a sort to get a more consistent listing + paths.sort_unstable(); + let zstd = ZstdEncoder::with_quality_and_params( + file, + Level::Default, + &[CParameter::enable_long_distance_matching(true)], + ); + let mut builder = Builder::new(zstd); + // Use reproducible header mode + builder.mode(HeaderMode::Deterministic); + for p in paths { + let rel_path = p.strip_prefix(path)?; + if rel_path.is_empty() { + // The top directory should not be compressed, + // the tar crate doesn't like that + continue; + } + builder.append_path_with_name(&p, rel_path).await?; + } + let mut zstd = builder.into_inner().await?; + zstd.shutdown().await?; + let mut compressed = zstd.into_inner(); + let compressed_len = compressed.metadata().await?.len(); + compressed.seek(SeekFrom::Start(0)).await?; + Ok((compressed, compressed_len)) +} + +/// Creates a Zstandard tarball. 
+pub async fn extract_zst_tarball( + path: &Utf8Path, + tarball: impl AsyncBufRead + Unpin, +) -> Result<()> { + let decoder = Box::pin(ZstdDecoder::new(tarball)); + let mut archive = Archive::new(decoder); + archive.unpack(path).await?; + Ok(()) +} diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 7160a42df2..3223765016 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -25,6 +25,8 @@ pub struct Config { /// /// For simplicity, this value must be greater than or equal to `memory_history_len`. memory_history_log_interval: usize, + /// The max number of iterations to skip before logging the next iteration + memory_history_log_noskip_interval: Duration, } impl Default for Config { @@ -33,6 +35,7 @@ impl Default for Config { memory_poll_interval: Duration::from_millis(100), memory_history_len: 5, // use 500ms of history for decision-making memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy) + memory_history_log_noskip_interval: Duration::from_secs(15), // but only if it's changed, or 60 seconds have passed } } } @@ -85,7 +88,12 @@ impl CgroupWatcher { // buffer for samples that will be logged. once full, it remains so. let history_log_len = self.config.memory_history_log_interval; + let max_skip = self.config.memory_history_log_noskip_interval; let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len]; + let mut last_logged_memusage = MemoryStatus::zeroed(); + + // Ensure that we're tracking a value that's definitely in the past, as Instant::now is only guaranteed to be non-decreasing on Rust's T1-supported systems. + let mut can_skip_logs_until = Instant::now() - max_skip; for t in 0_u64.. 
{ ticker.tick().await; @@ -115,12 +123,24 @@ impl CgroupWatcher { // equal to the logging interval, we can just log the entire buffer every time we set // the last entry, which also means that for this log line, we can ignore that it's a // ring buffer (because all the entries are in order of increasing time). - if i == history_log_len - 1 { + // + // We skip logging the data if data hasn't meaningfully changed in a while, unless + // we've already ignored previous iterations for the last max_skip period. + if i == history_log_len - 1 + && (now > can_skip_logs_until + || !history_log_buf + .iter() + .all(|usage| last_logged_memusage.status_is_close_or_similar(usage))) + { info!( history = ?MemoryStatus::debug_slice(&history_log_buf), summary = ?summary, "Recent cgroup memory statistics history" ); + + can_skip_logs_until = now + max_skip; + + last_logged_memusage = *history_log_buf.last().unwrap(); } updates @@ -232,6 +252,24 @@ impl MemoryStatus { DS(slice) } + + /// Check if the other memory status is a close or similar result. + /// Returns true if the larger value is not larger than the smaller value + /// by 1/8 of the smaller value, and within 128MiB. + /// See tests::check_similarity_behaviour for examples of behaviour + fn status_is_close_or_similar(&self, other: &MemoryStatus) -> bool { + let margin; + let diff; + if self.non_reclaimable >= other.non_reclaimable { + margin = other.non_reclaimable / 8; + diff = self.non_reclaimable - other.non_reclaimable; + } else { + margin = self.non_reclaimable / 8; + diff = other.non_reclaimable - self.non_reclaimable; + } + + diff < margin && diff < 128 * 1024 * 1024 + } } #[cfg(test)] @@ -261,4 +299,65 @@ mod tests { assert_eq!(values(2, 4), [9, 0, 1, 2]); assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]); } + + #[test] + fn check_similarity_behaviour() { + // This all accesses private methods, so we can't actually run this + // as doctests, because doctests run as an external crate. 
+ let mut small = super::MemoryStatus { + non_reclaimable: 1024, + }; + let mut large = super::MemoryStatus { + non_reclaimable: 1024 * 1024 * 1024 * 1024, + }; + + // objects are self-similar, no matter the size + assert!(small.status_is_close_or_similar(&small)); + assert!(large.status_is_close_or_similar(&large)); + + // inequality is symmetric + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + + small.non_reclaimable = 64; + large.non_reclaimable = (small.non_reclaimable / 8) * 9; + + // objects are self-similar, no matter the size + assert!(small.status_is_close_or_similar(&small)); + assert!(large.status_is_close_or_similar(&large)); + + // values are similar if the larger value is larger by less than + // 12.5%, i.e. 1/8 of the smaller value. + // In the example above, large is exactly 12.5% larger, so this doesn't + // match. + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + + large.non_reclaimable -= 1; + assert!(large.status_is_close_or_similar(&large)); + + assert!(small.status_is_close_or_similar(&large)); + assert!(large.status_is_close_or_similar(&small)); + + // The 1/8 rule only applies up to 128MiB of difference + small.non_reclaimable = 1024 * 1024 * 1024 * 1024; + large.non_reclaimable = small.non_reclaimable / 8 * 9; + assert!(small.status_is_close_or_similar(&small)); + assert!(large.status_is_close_or_similar(&large)); + + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + // the large value is put just above the threshold + large.non_reclaimable = small.non_reclaimable + 128 * 1024 * 1024; + assert!(large.status_is_close_or_similar(&large)); + + assert!(!small.status_is_close_or_similar(&large)); + assert!(!large.status_is_close_or_similar(&small)); + // now below + large.non_reclaimable -= 1; + assert!(large.status_is_close_or_similar(&large)); + + 
assert!(small.status_is_close_or_similar(&large)); + assert!(large.status_is_close_or_similar(&small)); + } } diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index c76baf04e7..6a965ace9b 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -12,11 +12,11 @@ use futures::{ stream::{SplitSink, SplitStream}, SinkExt, StreamExt, }; -use tracing::info; +use tracing::{debug, info}; use crate::protocol::{ - OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION, - PROTOCOL_MIN_VERSION, + OutboundMsg, OutboundMsgKind, ProtocolRange, ProtocolResponse, ProtocolVersion, + PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, }; /// The central handler for all communications in the monitor. @@ -118,7 +118,12 @@ impl Dispatcher { /// serialize the wrong thing and send it, since `self.sink.send` will take /// any string. pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> { - info!(?message, "sending message"); + if matches!(&message.inner, OutboundMsgKind::HealthCheck { .. }) { + debug!(?message, "sending message"); + } else { + info!(?message, "sending message"); + } + let json = serde_json::to_string(&message).context("failed to serialize message")?; self.sink .send(Message::Text(json)) diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index ba37966476..36f8573a38 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -12,7 +12,7 @@ use axum::extract::ws::{Message, WebSocket}; use futures::StreamExt; use tokio::sync::{broadcast, watch}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; use crate::cgroup::{self, CgroupWatcher}; use crate::dispatcher::Dispatcher; @@ -69,7 +69,7 @@ pub struct Config { /// should be removed once we have a better solution there. 
sys_buffer_bytes: u64, - /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in + /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in /// other words, providing a ceiling for the highest value of the threshold by enforcing that /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the /// threshold. @@ -474,26 +474,29 @@ impl Runner { // there is a message from the agent msg = self.dispatcher.source.next() => { if let Some(msg) = msg { - // Don't use 'message' as a key as the string also uses - // that for its key - info!(?msg, "received message"); - match msg { + match &msg { Ok(msg) => { let message: InboundMsg = match msg { Message::Text(text) => { - serde_json::from_str(&text).context("failed to deserialize text message")? + serde_json::from_str(text).context("failed to deserialize text message")? } other => { warn!( // Don't use 'message' as a key as the // string also uses that for its key msg = ?other, - "agent should only send text messages but received different type" + "problem processing incoming message: agent should only send text messages but received different type" ); continue }, }; + if matches!(&message.inner, InboundMsgKind::HealthCheck { .. 
}) { + debug!(?msg, "received message"); + } else { + info!(?msg, "received message"); + } + let out = match self.process_message(message.clone()).await { Ok(Some(out)) => out, Ok(None) => continue, @@ -517,7 +520,11 @@ impl Runner { .await .context("failed to send message")?; } - Err(e) => warn!("{e}"), + Err(e) => warn!( + error = format!("{e}"), + msg = ?msg, + "received error message" + ), } } else { anyhow::bail!("dispatcher connection closed") diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs index fd09030dbd..3126b170a4 100644 --- a/libs/walproposer/build.rs +++ b/libs/walproposer/build.rs @@ -34,6 +34,9 @@ fn main() -> anyhow::Result<()> { println!("cargo:rustc-link-lib=static=walproposer"); println!("cargo:rustc-link-search={walproposer_lib_search_str}"); + // Rebuild crate when libwalproposer.a changes + println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a"); + let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config"); let inc_server_path: String = if pg_config_bin.exists() { let output = Command::new(pg_config_bin) @@ -79,6 +82,7 @@ fn main() -> anyhow::Result<()> { .allowlist_function("WalProposerBroadcast") .allowlist_function("WalProposerPoll") .allowlist_function("WalProposerFree") + .allowlist_function("SafekeeperStateDesiredEvents") .allowlist_var("DEBUG5") .allowlist_var("DEBUG4") .allowlist_var("DEBUG3") diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index 1f7bf952dc..bbc3663402 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -22,6 +22,7 @@ use crate::bindings::WalProposerExecStatusType; use crate::bindings::WalproposerShmemState; use crate::bindings::XLogRecPtr; use crate::walproposer::ApiImpl; +use crate::walproposer::StreamingCallback; use crate::walproposer::WaitResult; extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState { @@ -36,7 +37,8 @@ extern "C" fn 
start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).start_streaming(startpos) + let callback = StreamingCallback::new(wp); + (*api).start_streaming(startpos, &callback); } } @@ -48,6 +50,14 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr { } } +extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) { + unsafe { + let callback_data = (*(*wp).config).callback_data; + let api = callback_data as *mut Box; + (*api).update_donor(&mut (*donor), donor_lsn) + } +} + extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz { unsafe { let callback_data = (*(*wp).config).callback_data; @@ -134,19 +144,18 @@ extern "C" fn conn_async_read( unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; - let (res, result) = (*api).conn_async_read(&mut (*sk)); // This function has guarantee that returned buf will be valid until // the next call. So we can store a Vec in each Safekeeper and reuse // it on the next call. let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default(); - inbuf.clear(); - inbuf.extend_from_slice(res); + + let result = (*api).conn_async_read(&mut (*sk), &mut inbuf); // Put a Vec back to sk->inbuf and return data ptr. 
+ *amount = inbuf.len() as i32; *buf = store_vec_u8(&mut (*sk).inbuf, inbuf); - *amount = res.len() as i32; result } @@ -182,6 +191,10 @@ extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bo unsafe { let callback_data = (*(*(*sk).wp).config).callback_data; let api = callback_data as *mut Box; + + // currently `recovery_download` is always called right after election + (*api).after_election(&mut (*wp)); + (*api).recovery_download(&mut (*wp), &mut (*sk)) } } @@ -277,7 +290,8 @@ extern "C" fn wait_event_set( } WaitResult::Timeout => { *event_sk = std::ptr::null_mut(); - *events = crate::bindings::WL_TIMEOUT; + // WaitEventSetWait returns 0 for timeout. + *events = 0; 0 } WaitResult::Network(sk, event_mask) => { @@ -318,11 +332,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) { } } -extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) { +extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) { unsafe { let callback_data = (*(*wp).config).callback_data; let api = callback_data as *mut Box; - (*api).process_safekeeper_feedback(&mut (*wp), commit_lsn) + (*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk)); } } @@ -340,7 +354,7 @@ extern "C" fn log_internal( } } -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum Level { Debug5, Debug4, @@ -385,6 +399,7 @@ pub(crate) fn create_api() -> walproposer_api { get_shmem_state: Some(get_shmem_state), start_streaming: Some(start_streaming), get_flush_rec_ptr: Some(get_flush_rec_ptr), + update_donor: Some(update_donor), get_current_timestamp: Some(get_current_timestamp), conn_error_message: Some(conn_error_message), conn_status: Some(conn_status), @@ -415,6 +430,32 @@ pub(crate) fn create_api() -> walproposer_api { } } +pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { + let empty_feedback = crate::bindings::PageserverFeedback { + present: false, + currentClusterSize: 0, + 
last_received_lsn: 0, + disk_consistent_lsn: 0, + remote_consistent_lsn: 0, + replytime: 0, + shard_number: 0, + }; + + crate::bindings::WalproposerShmemState { + propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 }, + donor_name: [0; 64], + donor_conninfo: [0; 1024], + donor_lsn: 0, + mutex: 0, + mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 }, + backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 }, + currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 }, + shard_ps_feedback: [empty_feedback; 128], + num_shards: 0, + min_ps_feedback: empty_feedback, + } +} + impl std::fmt::Display for Level { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{:?}", self) diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 7251545792..f7b72b205f 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -1,26 +1,25 @@ use std::ffi::CString; -use postgres_ffi::WAL_SEGMENT_SIZE; -use utils::id::TenantTimelineId; - use crate::{ api_bindings::{create_api, take_vec_u8, Level}, bindings::{ - NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, - WalProposerFree, WalProposerStart, + NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig, + WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart, }, }; +use postgres_ffi::WAL_SEGMENT_SIZE; +use utils::{id::TenantTimelineId, lsn::Lsn}; /// Rust high-level wrapper for C walproposer API. Many methods are not required /// for simple cases, hence todo!() in default implementations. /// /// Refer to `pgxn/neon/walproposer.h` for documentation. 
pub trait ApiImpl { - fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState { + fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { todo!() } - fn start_streaming(&self, _startpos: u64) { + fn start_streaming(&self, _startpos: u64, _callback: &StreamingCallback) { todo!() } @@ -28,6 +27,10 @@ pub trait ApiImpl { todo!() } + fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) { + todo!() + } + fn get_current_timestamp(&self) -> i64 { todo!() } @@ -70,7 +73,11 @@ pub trait ApiImpl { todo!() } - fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) { + fn conn_async_read( + &self, + _sk: &mut Safekeeper, + _vec: &mut Vec, + ) -> crate::bindings::PGAsyncReadResult { todo!() } @@ -138,7 +145,7 @@ pub trait ApiImpl { todo!() } - fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) { + fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) { todo!() } @@ -151,12 +158,14 @@ pub trait ApiImpl { } } +#[derive(Debug)] pub enum WaitResult { Latch, Timeout, Network(*mut Safekeeper, u32), } +#[derive(Clone)] pub struct Config { /// Tenant and timeline id pub ttid: TenantTimelineId, @@ -242,6 +251,24 @@ impl Drop for Wrapper { } } +pub struct StreamingCallback { + wp: *mut WalProposer, +} + +impl StreamingCallback { + pub fn new(wp: *mut WalProposer) -> StreamingCallback { + StreamingCallback { wp } + } + + pub fn broadcast(&self, startpos: Lsn, endpos: Lsn) { + unsafe { WalProposerBroadcast(self.wp, startpos.0, endpos.0) } + } + + pub fn poll(&self) { + unsafe { WalProposerPoll(self.wp) } + } +} + #[cfg(test)] mod tests { use core::panic; @@ -250,6 +277,7 @@ mod tests { sync::{atomic::AtomicUsize, mpsc::sync_channel}, }; + use std::cell::UnsafeCell; use utils::id::TenantTimelineId; use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper}; @@ -273,6 +301,8 @@ mod tests { replies_ptr: AtomicUsize, // 
channel to send LSN to the main thread sync_channel: std::sync::mpsc::SyncSender, + // Shmem state, used for storing donor info + shmem: UnsafeCell, } impl MockImpl { @@ -303,11 +333,22 @@ mod tests { } impl ApiImpl for MockImpl { + fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState { + self.shmem.get() + } + fn get_current_timestamp(&self) -> i64 { println!("get_current_timestamp"); 0 } + fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) { + let mut shmem = unsafe { *self.get_shmem_state() }; + shmem.propEpochStartLsn.value = donor_lsn; + shmem.donor_conninfo = donor.conninfo; + shmem.donor_lsn = donor_lsn; + } + fn conn_status( &self, _: &mut crate::bindings::Safekeeper, @@ -344,14 +385,13 @@ mod tests { fn conn_async_read( &self, _: &mut crate::bindings::Safekeeper, - ) -> (&[u8], crate::bindings::PGAsyncReadResult) { + vec: &mut Vec, + ) -> crate::bindings::PGAsyncReadResult { println!("conn_async_read"); let reply = self.next_safekeeper_reply(); println!("conn_async_read result: {:?}", reply); - ( - reply, - crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS, - ) + vec.extend_from_slice(reply); + crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS } fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool { @@ -453,9 +493,12 @@ mod tests { event_mask: 0, }), expected_messages: vec![ - // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) + // TODO: When updating Postgres versions, this test will cause + // problems. Postgres version in message needs updating. 
+ // + // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 }) vec![ - 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147, 188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1, @@ -481,6 +524,7 @@ mod tests { ], replies_ptr: AtomicUsize::new(0), sync_channel: sender, + shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()), }); let config = crate::walproposer::Config { ttid, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 980fbab22e..4335f38f1e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,7 @@ testing = ["fail/failpoints"] [dependencies] anyhow.workspace = true +arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true async-trait.workspace = true @@ -21,7 +22,6 @@ camino.workspace = true camino-tempfile.workspace = true chrono = { workspace = true, features = ["serde"] } clap = { workspace = true, features = ["string"] } -close_fds.workspace = true const_format.workspace = true consumption_metrics.workspace = true crc32c.workspace = true @@ -36,6 +36,7 @@ humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true itertools.workspace = true +leaky-bucket.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses @@ -58,19 +59,23 @@ signal-hook.workspace = true smallvec = { workspace = true, features = ["write"] } svg_fmt.workspace = true sync_wrapper.workspace = true +sysinfo.workspace = true tokio-tar.workspace = true 
thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true tokio-postgres.workspace = true tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +twox-hash.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true pageserver_api.workspace = true +pageserver_compaction.workspace = true postgres_connection.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true @@ -82,10 +87,13 @@ workspace_hack.workspace = true reqwest.workspace = true rpds.workspace = true enum-map.workspace = true -enumset.workspace = true +enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true +[target.'cfg(target_os = "linux")'.dependencies] +procfs.workspace = true + [dev-dependencies] criterion.workspace = true hex-literal.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 5d05af0c00..1d02aa7709 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,7 +1,7 @@ use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; @@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { let mut updates = layer_map.batch_update(); for fname in filenames { let fname = fname.unwrap(); - let fname = LayerFileName::from_str(&fname).unwrap(); + let fname = LayerName::from_str(&fname).unwrap(); let layer = PersistentLayerDesc::from(fname); let 
lsn_range = layer.get_lsn_range(); diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 4837626086..5aab10e5d9 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -1,191 +1,156 @@ -//! Simple benchmarking around walredo. +//! Quantify a single walredo manager's throughput under N concurrent callers. //! -//! Right now they hope to just set a baseline. Later we can try to expand into latency and -//! throughput after figuring out the coordinated omission problems below. +//! The benchmark implementation ([`bench_impl`]) is parametrized by +//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`] +//! - `n_redos` => number of times the benchmark shell execute the `redo_work` +//! - `nclients` => number of clients (more on this shortly). //! -//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by -//! logging what happens when a sequential scan is requested on a small table, then picking out two -//! suitable from logs. - -use std::sync::{Arc, Barrier}; +//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters. +//! It spawns `nclients` times [`client`] tokio tasks. +//! Each task executes the `redo_work` `n_redos/nclients` times. +//! +//! We exercise the following combinations: +//! - `redo_work = short / medium`` +//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]` +//! +//! We let `criterion` determine the `n_redos` using `iter_custom`. +//! The idea is that for each `(redo_work, nclients)` combination, +//! criterion will run the `bench_impl` multiple times with different `n_redos`. +//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective. +//! Criterion will divide that by `n_redos` to compute the "time per iteration". +//! In our case, "time per iteration" means "time per redo_work execution". +//! +//! 
NB: the way by which `iter_custom` determines the "number of iterations" +//! is called sampling. Apparently the idea here is to detect outliers. +//! We're not sure whether the current choice of sampling method makes sense. +//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples +//! +//! # Reference Numbers +//! +//! 2024-04-15 on i3en.3xlarge +//! +//! ```text +//! short/1 time: [24.584 µs 24.737 µs 24.922 µs] +//! short/2 time: [33.479 µs 33.660 µs 33.888 µs] +//! short/4 time: [42.713 µs 43.046 µs 43.440 µs] +//! short/8 time: [71.814 µs 72.478 µs 73.240 µs] +//! short/16 time: [132.73 µs 134.45 µs 136.22 µs] +//! short/32 time: [258.31 µs 260.73 µs 263.27 µs] +//! short/64 time: [511.61 µs 514.44 µs 517.51 µs] +//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms] +//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs] +//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs] +//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs] +//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs] +//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms] +//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms] +//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms] +//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] +//! 
``` use bytes::{Buf, Bytes}; -use pageserver::{ - config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, +use criterion::{BenchmarkId, Criterion}; +use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver_api::{key::Key, shard::TenantShardId}; +use std::{ + sync::Arc, + time::{Duration, Instant}, }; -use pageserver_api::shard::TenantShardId; +use tokio::{sync::Barrier, task::JoinSet}; use utils::{id::TenantId, lsn::Lsn}; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; - -fn redo_scenarios(c: &mut Criterion) { - // logging should be enabled when adding more inputs, since walredo will only report malformed - // input to the stderr. - // utils::logging::init(utils::logging::LogFormat::Plain).unwrap(); +fn bench(c: &mut Criterion) { + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("short"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::short_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } + { + let nclients = [1, 2, 4, 8, 16, 32, 64, 128]; + for nclients in nclients { + let mut group = c.benchmark_group("medium"); + group.bench_with_input( + BenchmarkId::from_parameter(nclients), + &nclients, + |b, nclients| { + let redo_work = Arc::new(Request::medium_input()); + b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients)); + }, + ); + } + } +} +criterion::criterion_group!(benches, bench); +criterion::criterion_main!(benches); +// Returns the sum of each client's wall-clock time spent executing their share of the n_redos. 
+fn bench_impl(redo_work: Arc, n_redos: u64, nclients: u64) -> Duration { let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap(); let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_shard_id); + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + let start = Arc::new(Barrier::new(nclients as usize)); + + let mut tasks = JoinSet::new(); + + let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); - { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - tracing::info!("executing first"); - short().execute(rt.handle(), &manager).unwrap(); - tracing::info!("first executed"); + // divide the amount of work equally among the clients. + let nredos_per_client = n_redos / nclients; + for _ in 0..nclients { + rt.block_on(async { + tasks.spawn(client( + Arc::clone(&manager), + Arc::clone(&start), + Arc::clone(&redo_work), + nredos_per_client, + )) + }); } - let thread_counts = [1, 2, 4, 8, 16]; - - let mut group = c.benchmark_group("short"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("short", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, short); - }, - ); - } - drop(group); - - let mut group = c.benchmark_group("medium"); - group.sampling_mode(criterion::SamplingMode::Flat); - - for thread_count in thread_counts { - group.bench_with_input( - BenchmarkId::new("medium", thread_count), - &thread_count, - |b, thread_count| { - add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium); - }, - ); - } - drop(group); -} - -/// Sets up `threads` number of requesters to 
`request_redo`, with the given input. -fn add_multithreaded_walredo_requesters( - b: &mut criterion::Bencher, - threads: u32, - manager: &Arc, - input_factory: fn() -> Request, -) { - assert_ne!(threads, 0); - - if threads == 1 { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let handle = rt.handle(); - b.iter_batched_ref( - || Some(input_factory()), - |input| execute_all(input.take(), handle, manager), - criterion::BatchSize::PerIteration, - ); - } else { - let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize); - - let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx)); - - let barrier = Arc::new(Barrier::new(threads as usize + 1)); - - let jhs = (0..threads) - .map(|_| { - std::thread::spawn({ - let manager = manager.clone(); - let barrier = barrier.clone(); - let work_rx = work_rx.clone(); - move || { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let handle = rt.handle(); - loop { - // queue up and wait if we want to go another round - if work_rx.lock().unwrap().recv().is_err() { - break; - } - - let input = Some(input_factory()); - - barrier.wait(); - - execute_all(input, handle, &manager).unwrap(); - - barrier.wait(); - } - } - }) - }) - .collect::>(); - - let _jhs = JoinOnDrop(jhs); - - b.iter_batched( - || { - for _ in 0..threads { - work_tx.send(()).unwrap() - } - }, - |()| { - // start the work - barrier.wait(); - - // wait for work to complete - barrier.wait(); - }, - criterion::BatchSize::PerIteration, - ); - - drop(work_tx); - } -} - -struct JoinOnDrop(Vec>); - -impl Drop for JoinOnDrop { - // it's not really needless because we want join all then check for panicks - #[allow(clippy::needless_collect)] - fn drop(&mut self) { - // first join all - let results = self.0.drain(..).map(|jh| jh.join()).collect::>(); - // then check the results; panicking here is not great, but it does get the message across - // to the user, 
and sets an exit value. - results.into_iter().try_for_each(|res| res).unwrap(); - } -} - -fn execute_all( - input: I, - handle: &tokio::runtime::Handle, - manager: &PostgresRedoManager, -) -> anyhow::Result<()> -where - I: IntoIterator, -{ - // just fire all requests as fast as possible - input.into_iter().try_for_each(|req| { - let page = req.execute(handle, manager)?; - assert_eq!(page.remaining(), 8192); - anyhow::Ok(()) + rt.block_on(async move { + let mut total_wallclock_time = Duration::ZERO; + while let Some(res) = tasks.join_next().await { + total_wallclock_time += res.unwrap(); + } + total_wallclock_time }) } -criterion_group!(benches, redo_scenarios); -criterion_main!(benches); +async fn client( + mgr: Arc, + start: Arc, + redo_work: Arc, + n_redos: u64, +) -> Duration { + start.wait().await; + let start = Instant::now(); + for _ in 0..n_redos { + let page = redo_work.execute(&mgr).await.unwrap(); + assert_eq!(page.remaining(), 8192); + // The real pageserver will rarely if ever do 2 walredos in a row without + // yielding to the executor. + tokio::task::yield_now().await; + } + start.elapsed() +} macro_rules! lsn { ($input:expr) => {{ @@ -197,12 +162,46 @@ macro_rules! lsn { }}; } -/// Short payload, 1132 bytes. -// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 -// for null bytes. -#[allow(clippy::octal_escapes)] -fn short() -> Request { - Request { +/// Simple wrapper around `WalRedoManager::request_redo`. +/// +/// In benchmarks this is cloned around. 
+#[derive(Clone)] +struct Request { + key: Key, + lsn: Lsn, + base_img: Option<(Lsn, Bytes)>, + records: Vec<(Lsn, NeonWalRecord)>, + pg_version: u32, +} + +impl Request { + async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result { + let Request { + key, + lsn, + base_img, + records, + pg_version, + } = self; + + // TODO: avoid these clones + manager + .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) + .await + } + + fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { + let rec = Bytes::from_static(bytes); + NeonWalRecord::Postgres { will_init, rec } + } + + /// Short payload, 1132 bytes. + // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0 + // for null bytes. + #[allow(clippy::octal_escapes)] + pub fn short_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -225,13 +224,14 @@ fn short() -> Request { ], pg_version: 14, } -} + } -/// Medium sized payload, serializes as 26393 bytes. -// see [`short`] -#[allow(clippy::octal_escapes)] -fn medium() -> Request { - Request { + /// Medium sized payload, serializes as 26393 bytes. + // see [`short`] + #[allow(clippy::octal_escapes)] + pub fn medium_input() -> Request { + let pg_record = Self::pg_record; + Request { key: Key { field1: 0, field2: 1663, @@ -473,39 +473,5 @@ fn medium() -> Request { ], pg_version: 14, } -} - -fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { - let rec = Bytes::from_static(bytes); - NeonWalRecord::Postgres { will_init, rec } -} - -/// Simple wrapper around `WalRedoManager::request_redo`. -/// -/// In benchmarks this is cloned around. 
-#[derive(Clone)] -struct Request { - key: Key, - lsn: Lsn, - base_img: Option<(Lsn, Bytes)>, - records: Vec<(Lsn, NeonWalRecord)>, - pg_version: u32, -} - -impl Request { - fn execute( - self, - rt: &tokio::runtime::Handle, - manager: &PostgresRedoManager, - ) -> anyhow::Result { - let Request { - key, - lsn, - base_img, - records, - pg_version, - } = self; - - rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version)) } } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4c285293f7..69b86d9c46 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,13 +1,17 @@ +use std::collections::HashMap; + +use bytes::Bytes; use pageserver_api::{models::*, shard::TenantShardId}; -use reqwest::{IntoUrl, Method}; +use reqwest::{IntoUrl, Method, StatusCode}; use utils::{ http::error::HttpErrorBody, id::{TenantId, TimelineId}, + lsn::Lsn, }; pub mod util; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Client { mgmt_api_endpoint: String, authorization_header: Option, @@ -22,20 +26,21 @@ pub enum Error { #[error("receive error body: {0}")] ReceiveErrorBody(String), - #[error("pageserver API: {0}")] - ApiError(String), + #[error("pageserver API: {1}")] + ApiError(StatusCode, String), + + #[error("Cancelled")] + Cancelled, } pub type Result = std::result::Result; -#[async_trait::async_trait] pub trait ResponseErrorMessageExt: Sized { - async fn error_from_body(self) -> Result; + fn error_from_body(self) -> impl std::future::Future> + Send; } -#[async_trait::async_trait] impl ResponseErrorMessageExt for reqwest::Response { - async fn error_from_body(mut self) -> Result { + async fn error_from_body(self) -> Result { let status = self.status(); if !(status.is_client_error() || status.is_server_error()) { return Ok(self); @@ -43,7 +48,7 @@ impl ResponseErrorMessageExt for reqwest::Response { let url = self.url().to_owned(); Err(match self.json::().await { - Ok(HttpErrorBody { msg }) => 
Error::ApiError(msg), + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), Err(_) => { Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url)) } @@ -51,12 +56,25 @@ impl ResponseErrorMessageExt for reqwest::Response { } } +pub enum ForceAwaitLogicalSize { + Yes, + No, +} + impl Client { pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) + } + + pub fn from_client( + client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ) -> Self { Self { mgmt_api_endpoint, authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")), - client: reqwest::Client::new(), + client, } } @@ -66,11 +84,30 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + /// Get an arbitrary path and returning a streaming Response. This function is suitable + /// for pass-through/proxy use cases where we don't care what the response content looks + /// like. + /// + /// Use/add one of the properly typed methods below if you know aren't proxying, and + /// know what kind of response you expect. + pub async fn get_raw(&self, path: String) -> Result { + debug_assert!(path.starts_with('/')); + let uri = format!("{}{}", self.mgmt_api_endpoint, path); + + let req = self.client.request(Method::GET, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + req.send().await.map_err(Error::ReceiveBody) + } + pub async fn tenant_details( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, ) -> Result { - let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint); + let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint); self.get(uri) .await? 
.json() @@ -80,23 +117,10 @@ impl Client { pub async fn list_timelines( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, ) -> Result> { - let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint); - self.get(&uri) - .await? - .json() - .await - .map_err(Error::ReceiveBody) - } - - pub async fn timeline_info( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + "{}/v1/tenant/{tenant_shard_id}/timeline", self.mgmt_api_endpoint ); self.get(&uri) @@ -106,13 +130,36 @@ impl Client { .map_err(Error::ReceiveBody) } + pub async fn timeline_info( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + force_await_logical_size: ForceAwaitLogicalSize, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", + self.mgmt_api_endpoint + ); + + let uri = match force_await_logical_size { + ForceAwaitLogicalSize::Yes => format!("{}?force-await-logical-size={}", uri, true), + ForceAwaitLogicalSize::No => uri, + }; + + self.get(&uri) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn keyspace( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace", self.mgmt_api_endpoint ); self.get(&uri) @@ -126,7 +173,7 @@ impl Client { self.request(Method::GET, uri, ()).await } - async fn request( + async fn request_noerror( &self, method: Method, uri: U, @@ -138,7 +185,16 @@ impl Client { } else { req }; - let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?; + req.json(&body).send().await.map_err(Error::ReceiveBody) + } + + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; let response = res.error_from_body().await?; Ok(response) } @@ -158,52 +214,169 @@ impl Client { .map_err(Error::ReceiveBody) } + /// The tenant deletion API can return 202 if deletion is incomplete, or + /// 404 if it is complete. Callers are responsible for checking the status + /// code and retrying. Error codes other than 404 will return Err(). 
+ pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result { + let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint); + + match self.request(Method::DELETE, &uri, ()).await { + Err(Error::ApiError(status_code, msg)) => { + if status_code == StatusCode::NOT_FOUND { + Ok(StatusCode::NOT_FOUND) + } else { + Err(Error::ApiError(status_code, msg)) + } + } + Err(e) => Err(e), + Ok(response) => Ok(response.status()), + } + } + + pub async fn tenant_time_travel_remote_storage( + &self, + tenant_shard_id: TenantShardId, + timestamp: &str, + done_if_after: &str, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/time_travel_remote_storage?travel_to={timestamp}&done_if_after={done_if_after}", + self.mgmt_api_endpoint + ); + self.request(Method::PUT, &uri, ()).await?; + Ok(()) + } + + pub async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_id}/scan_remote_storage", + self.mgmt_api_endpoint + ); + let response = self.request(Method::GET, &uri, ()).await?; + let body = response.json().await.map_err(Error::ReceiveBody)?; + Ok(body) + } + pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> { let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint); self.request(Method::PUT, &uri, req).await?; Ok(()) } - pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> { - let uri = format!( + pub async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress)> { + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/secondary/download", self.mgmt_api_endpoint, tenant_id - ); - self.request(Method::POST, &uri, ()) + )) + .expect("Cannot build URL"); + + if let Some(wait) = wait { + path.query_pairs_mut() + .append_pair("wait_ms", &format!("{}", wait.as_millis())); + } + + let response = self.request(Method::POST, 
path, ()).await?; + let status = response.status(); + let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?; + Ok((status, progress)) + } + + pub async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/secondary/status", + self.mgmt_api_endpoint, tenant_shard_id + )) + .expect("Cannot build URL"); + + self.request(Method::GET, path, ()) .await? - .error_for_status() - .map(|_| ()) - .map_err(|e| Error::ApiError(format!("{}", e))) + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + let path = reqwest::Url::parse(&format!( + "{}/v1/tenant/{}/heatmap_upload", + self.mgmt_api_endpoint, tenant_id + )) + .expect("Cannot build URL"); + + self.request(Method::POST, path, ()).await?; + Ok(()) } pub async fn location_config( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, config: LocationConfig, flush_ms: Option, + lazy: bool, ) -> Result<()> { - let req_body = TenantLocationConfigRequest { tenant_id, config }; - let path = format!( + let req_body = TenantLocationConfigRequest { config }; + + let mut path = reqwest::Url::parse(&format!( "{}/v1/tenant/{}/location_config", - self.mgmt_api_endpoint, tenant_id - ); - let path = if let Some(flush_ms) = flush_ms { - format!("{}?flush_ms={}", path, flush_ms.as_millis()) - } else { - path - }; - self.request(Method::PUT, &path, &req_body).await?; + self.mgmt_api_endpoint, tenant_shard_id + )) + // Should always work: mgmt_api_endpoint is configuration, not user input. 
+ .expect("Cannot build URL"); + + if lazy { + path.query_pairs_mut().append_pair("lazy", "true"); + } + + if let Some(flush_ms) = flush_ms { + path.query_pairs_mut() + .append_pair("flush_ms", &format!("{}", flush_ms.as_millis())); + } + + self.request(Method::PUT, path, &req_body).await?; Ok(()) } + pub async fn list_location_config(&self) -> Result { + let path = format!("{}/v1/location_config", self.mgmt_api_endpoint); + self.request(Method::GET, &path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> Result> { + let path = format!( + "{}/v1/location_config/{tenant_shard_id}", + self.mgmt_api_endpoint + ); + self.request(Method::GET, &path, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_create( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, req: &TimelineCreateRequest, ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline", - self.mgmt_api_endpoint, tenant_id + self.mgmt_api_endpoint, tenant_shard_id ); self.request(Method::POST, &uri, req) .await? @@ -211,4 +384,238 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + /// The timeline deletion API can return 201 if deletion is incomplete, or + /// 403 if it is complete. Callers are responsible for checking the status + /// code and retrying. Error codes other than 403 will return Err(). 
+ pub async fn timeline_delete( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", + self.mgmt_api_endpoint + ); + + match self.request(Method::DELETE, &uri, ()).await { + Err(Error::ApiError(status_code, msg)) => { + if status_code == StatusCode::NOT_FOUND { + Ok(StatusCode::NOT_FOUND) + } else { + Err(Error::ApiError(status_code, msg)) + } + } + Err(e) => Err(e), + Ok(response) => Ok(response.status()), + } + } + + pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{}/reset", + self.mgmt_api_endpoint, tenant_shard_id + ); + self.request(Method::POST, &uri, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_shard_split( + &self, + tenant_shard_id: TenantShardId, + req: TenantShardSplitRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/shard_split", + self.mgmt_api_endpoint, tenant_shard_id + ); + self.request(Method::PUT, &uri, req) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn timeline_list( + &self, + tenant_shard_id: &TenantShardId, + ) -> Result> { + let uri = format!( + "{}/v1/tenant/{}/timeline", + self.mgmt_api_endpoint, tenant_shard_id + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn tenant_synthetic_size( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/synthetic_size", + self.mgmt_api_endpoint, tenant_shard_id + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn put_io_engine( + &self, + engine: &pageserver_api::models::virtual_file::IoEngineKind, + ) -> Result<()> { + let uri = format!("{}/v1/io_engine", self.mgmt_api_endpoint); + self.request(Method::PUT, uri, engine) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn get_utilization(&self) -> Result { + let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); + self.get(uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + let uri = format!("{}/v1/top_tenants", self.mgmt_api_endpoint); + self.request(Method::POST, uri, request) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn layer_map_info( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, + ); + self.get(&uri) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn layer_evict( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::DELETE, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? 
+ status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn layer_ondemand_download( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer_file_name: &str, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/layer/{}", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name + ); + let resp = self.request_noerror(Method::GET, &uri, ()).await?; + match resp.status() { + StatusCode::OK => Ok(true), + StatusCode::NOT_MODIFIED => Ok(false), + // TODO: dedupe this pattern / introduce separate error variant? + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn ingest_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + aux_files: HashMap, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/ingest_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, IngestAuxFilesRequest { aux_files }) + .await?; + match resp.status() { + StatusCode::OK => Ok(true), + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } + + pub async fn list_aux_files( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result> { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/list_aux_files", + self.mgmt_api_endpoint, tenant_shard_id, timeline_id + ); + let resp = self + .request_noerror(Method::POST, &uri, ListAuxFilesRequest { lsn }) + .await?; + match resp.status() { + 
StatusCode::OK => { + let resp: HashMap = resp.json().await.map_err(|e| { + Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")) + })?; + Ok(resp) + } + status => Err(match resp.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri)) + } + }), + } + } } diff --git a/pageserver/client/src/mgmt_api/util.rs b/pageserver/client/src/mgmt_api/util.rs index 048a3bb7cd..bd85506d10 100644 --- a/pageserver/client/src/mgmt_api/util.rs +++ b/pageserver/client/src/mgmt_api/util.rs @@ -2,6 +2,7 @@ use std::sync::Arc; +use pageserver_api::shard::TenantShardId; use tokio::task::JoinSet; use utils::id::{TenantId, TenantTimelineId}; @@ -31,7 +32,10 @@ pub async fn get_pageserver_tenant_timelines_unsharded( async move { ( tenant_id, - mgmt_api_client.tenant_details(tenant_id).await.unwrap(), + mgmt_api_client + .tenant_details(TenantShardId::unsharded(tenant_id)) + .await + .unwrap(), ) } }); diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 231461267a..f9507fc47a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -60,7 +60,7 @@ impl Client { ) -> anyhow::Result { let copy_both: tokio_postgres::CopyBothDuplex = self .client - .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}")) + .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}")) .await?; let Client { cancel_on_client_drop, @@ -108,9 +108,32 @@ pub struct RelTagBlockNo { } impl PagestreamClient { - pub async fn shutdown(mut self) { - let _ = self.cancel_on_client_drop.take(); - self.conn_task.await.unwrap(); + pub async fn shutdown(self) { + let Self { + copy_both, + cancel_on_client_drop: cancel_conn_task, + conn_task, + } = self; + // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`. 
+ // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection. + // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56). + // + // If we drop(copy_both) first, but then immediately drop the `cancel_on_client_drop`, + // the CopyFail mesage only makes it to the socket sometimes (i.e., it's a race). + // + // Further, the pageserver makes a lot of noise when it receives CopyFail. + // Computes don't send it in practice, they just hard-close the connection. + // + // So, let's behave like the computes and suppress the CopyFail as follows: + // kill the socket first, then drop copy_both. + // + // See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY + // + // NB: page_service doesn't have a use case to exit the `pagestream` mode currently. + // => https://github.com/neondatabase/neon/issues/6390 + let _ = cancel_conn_task.unwrap(); + conn_task.await.unwrap(); + drop(copy_both); } pub async fn getpage( @@ -133,7 +156,8 @@ impl PagestreamClient { PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) | PagestreamBeMessage::Nblocks(_) - | PagestreamBeMessage::DbSize(_) => { + | PagestreamBeMessage::DbSize(_) + | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", msg.kind() diff --git a/pageserver/compaction/Cargo.toml b/pageserver/compaction/Cargo.toml new file mode 100644 index 0000000000..0fd1d81845 --- /dev/null +++ b/pageserver/compaction/Cargo.toml @@ -0,0 +1,53 @@ +[package] +name = "pageserver_compaction" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[features] +default = [] + +[dependencies] +anyhow.workspace = true +async-compression.workspace = true +async-stream.workspace = true +byteorder.workspace = true +bytes.workspace = true +chrono = { workspace = true, 
features = ["serde"] } +clap = { workspace = true, features = ["string"] } +const_format.workspace = true +consumption_metrics.workspace = true +crossbeam-utils.workspace = true +either.workspace = true +flate2.workspace = true +fail.workspace = true +futures.workspace = true +git-version.workspace = true +hex.workspace = true +humantime.workspace = true +humantime-serde.workspace = true +itertools.workspace = true +once_cell.workspace = true +pageserver_api.workspace = true +pin-project-lite.workspace = true +rand.workspace = true +smallvec = { workspace = true, features = ["write"] } +svg_fmt.workspace = true +sync_wrapper.workspace = true +thiserror.workspace = true +tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } +tokio-io-timeout.workspace = true +tokio-util.workspace = true +tracing.workspace = true +tracing-error.workspace = true +tracing-subscriber.workspace = true +url.workspace = true +walkdir.workspace = true +metrics.workspace = true +utils.workspace = true +workspace_hack.workspace = true + +[dev-dependencies] +criterion.workspace = true +hex-literal.workspace = true +tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } diff --git a/pageserver/compaction/TODO.md b/pageserver/compaction/TODO.md new file mode 100644 index 0000000000..85523ad5b3 --- /dev/null +++ b/pageserver/compaction/TODO.md @@ -0,0 +1,51 @@ +# TODO + +- If the key space can be perfectly partitioned at some key, perform planning on each + partition separately. For example, if we are compacting a level with layers like this: + + ``` + : + +--+ +----+ : +------+ + | | | | : | | + +--+ +----+ : +------+ + : + +-----+ +-+ : +--------+ + | | | | : | | + +-----+ +-+ : +--------+ + : + ``` + + At the dotted line, there is a natural split in the key space, such that all + layers are either on the left or the right of it. We can compact the + partitions separately. 
We could choose to create image layers for one + partition but not the other one, for example. + +- All the layers don't have to be exactly the same size, we can choose to cut a + layer short or stretch it a little larger than the target size, if it helps + the overall system. We can help perfect partitions (see previous bullet point) + to happen more frequently, by choosing the cut points wisely. For example, try + to cut layers at boundaries of underlying image layers. And "snap to grid", + i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0. + +- Avoid rewriting layers when we'd just create an identical layer to an input + layer. + +- Parallelism. The code is already split up into planning and execution, so that + we first split up the compaction work into "Jobs", and then execute them. + It would be straightforward to execute multiple jobs in parallel. + +- Materialize extra pages in delta layers during compaction. This would reduce + read amplification. There has been the idea of partial image layers. Materializing + extra pages in the delta layers achieves the same goal, without introducing a new + concept. + +## Simulator + +- Expand the simulator for more workloads +- Automate a test suite that runs the simulator with different workloads and + spits out a table of results +- Model read amplification +- More sanity checking. One idea is to keep a reference count of each + MockRecord, i.e. use Arc instead of plain MockRecord, and panic if + a MockRecord that is newer than PITR horizon is completely dropped. That would + indicate that the record was lost.
diff --git a/pageserver/compaction/src/bin/compaction-simulator.rs b/pageserver/compaction/src/bin/compaction-simulator.rs new file mode 100644 index 0000000000..c308694ae1 --- /dev/null +++ b/pageserver/compaction/src/bin/compaction-simulator.rs @@ -0,0 +1,215 @@ +use clap::{Parser, Subcommand}; +use pageserver_compaction::helpers::PAGE_SZ; +use pageserver_compaction::simulator::MockTimeline; +use rand::Rng; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::sync::OnceLock; + +use utils::project_git_version; + +project_git_version!(GIT_VERSION); + +#[derive(Parser)] +#[command( + version = GIT_VERSION, + about = "Neon Pageserver compaction simulator", + long_about = "A developer tool to visualize and test compaction" +)] +#[command(propagate_version = true)] +struct CliOpts { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + RunSuite, + Simulate(SimulateCmd), +} + +#[derive(Clone, clap::ValueEnum)] +enum Distribution { + Uniform, + HotCold, +} + +/// Read and update pageserver metadata file +#[derive(Parser)] +struct SimulateCmd { + distribution: Distribution, + + /// Number of records to digest + num_records: u64, + /// Record length + record_len: u64, + + // Logical database size in MB + logical_size: u64, +} + +async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> { + let mut executor = MockTimeline::new(); + + // Convert the logical size in MB into a key range. 
+ let key_range = 0..((cmd.logical_size * 1024 * 1024) / PAGE_SZ); + //let key_range = u64::MIN..u64::MAX; + println!( + "starting simulation with key range {:016X}-{:016X}", + key_range.start, key_range.end + ); + + // helper function to print progress indicator + let print_progress = |i| -> anyhow::Result<()> { + if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 { + print!( + "\ringested {} / {} records, {} MiB / {} MiB...", + i + 1, + cmd.num_records, + (i + 1) * cmd.record_len / (1_000_000), + cmd.num_records * cmd.record_len / (1_000_000), + ); + std::io::stdout().flush()?; + } + Ok(()) + }; + + match cmd.distribution { + Distribution::Uniform => { + for i in 0..cmd.num_records { + executor.ingest_uniform(1, cmd.record_len, &key_range)?; + executor.compact_if_needed().await?; + + print_progress(i)?; + } + } + Distribution::HotCold => { + let splitpoint = key_range.start + (key_range.end - key_range.start) / 10; + let hot_key_range = 0..splitpoint; + let cold_key_range = splitpoint..key_range.end; + + for i in 0..cmd.num_records { + let chosen_range = if rand::thread_rng().gen_bool(0.9) { + &hot_key_range + } else { + &cold_key_range + }; + executor.ingest_uniform(1, cmd.record_len, chosen_range)?; + executor.compact_if_needed().await?; + + print_progress(i)?; + } + } + } + println!("done!"); + executor.flush_l0(); + executor.compact_if_needed().await?; + let stats = executor.stats()?; + + // Print the stats to stdout, and also to a file + print!("{stats}"); + std::fs::write(results_path.join("stats.txt"), stats)?; + + let animation_path = results_path.join("compaction-animation.html"); + executor.draw_history(std::fs::File::create(&animation_path)?)?; + println!( + "animation: file://{}", + animation_path.canonicalize()?.display() + ); + + Ok(()) +} + +async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> { + std::fs::create_dir(results_path)?; + + set_log_file(File::create(results_path.join("log"))?); + let 
result = simulate(workload, results_path).await; + set_log_stdout(); + result +} + +async fn run_suite() -> anyhow::Result<()> { + let top_results_path = PathBuf::from(format!( + "compaction-suite-results.{}", + std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs() + )); + std::fs::create_dir(&top_results_path)?; + + let workload = SimulateCmd { + distribution: Distribution::Uniform, + // Generate 20 GB of WAL + record_len: 1_000, + num_records: 20_000_000, + // Logical size 5 GB + logical_size: 5_000, + }; + + run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?; + + println!( + "All tests finished. Results in {}", + top_results_path.display() + ); + Ok(()) +} + +use std::fs::File; +use std::io::Stdout; +use std::sync::Mutex; +use tracing_subscriber::fmt::writer::EitherWriter; +use tracing_subscriber::fmt::MakeWriter; + +static LOG_FILE: OnceLock>> = OnceLock::new(); +fn get_log_output() -> &'static Mutex> { + LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout()))) +} + +fn set_log_file(f: File) { + *get_log_output().lock().unwrap() = EitherWriter::A(f); +} + +fn set_log_stdout() { + *get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout()); +} + +fn init_logging() -> anyhow::Result<()> { + // We fall back to printing all spans at info-level or above if + // the RUST_LOG environment variable is not set. + let rust_log_env_filter = || { + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")) + }; + + // NB: the order of the with() calls does not matter. 
+ // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering + use tracing_subscriber::prelude::*; + tracing_subscriber::registry() + .with({ + let log_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_ansi(false) + .with_writer(|| get_log_output().make_writer()); + log_layer.with_filter(rust_log_env_filter()) + }) + .init(); + + Ok(()) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = CliOpts::parse(); + + init_logging()?; + + match cli.command { + Commands::Simulate(cmd) => { + simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?; + } + Commands::RunSuite => { + run_suite().await?; + } + }; + Ok(()) +} diff --git a/pageserver/compaction/src/compact_tiered.rs b/pageserver/compaction/src/compact_tiered.rs new file mode 100644 index 0000000000..20f88868f9 --- /dev/null +++ b/pageserver/compaction/src/compact_tiered.rs @@ -0,0 +1,940 @@ +//! # Tiered compaction algorithm. +//! +//! Read all the input delta files, and write a new set of delta files that +//! include all the input WAL records. See retile_deltas(). +//! +//! In a "normal" LSM tree, you get to remove any values that are overwritten by +//! later values, but in our system, we keep all the history. So the reshuffling +//! doesn't remove any garbage, it just reshuffles the records to reduce read +//! amplification, i.e. the number of files that you need to access to find the +//! WAL records for a given key. +//! +//! If the new delta files would be very "narrow", i.e. each file would cover +//! only a narrow key range, then we create a new set of image files +//! instead. The current threshold is that if the estimated total size of the +//! image layers is smaller than the size of the deltas, then we create image +//! layers. That amounts to 2x storage amplification, and it means that the +//! distance of image layers in LSN dimension is roughly equal to the logical +//! database size. 
For example, if the logical database size is 10 GB, we would +//! generate new image layers every 10 GB of WAL. +use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; +use tracing::{debug, info}; + +use std::collections::{HashSet, VecDeque}; +use std::ops::Range; + +use crate::helpers::{ + accum_key_values, keyspace_total_size, merge_delta_keys_buffered, overlaps_with, PAGE_SZ, +}; +use crate::interface::*; +use utils::lsn::Lsn; + +use crate::identify_levels::identify_level; + +/// Main entry point to compaction. +/// +/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on +/// everything below that point, that needs compaction. The cutoff LSN must +/// partition the layers so that there are no layers that span across that +/// LSN. To start compaction at the top of the tree, pass the end LSN of the +/// written last L0 layer. +pub async fn compact_tiered( + executor: &mut E, + end_lsn: Lsn, + target_file_size: u64, + fanout: u64, + ctx: &E::RequestContext, +) -> anyhow::Result<()> { + assert!(fanout >= 1, "fanout needs to be at least 1 but is {fanout}"); + let exp_base = fanout.max(2); + // Start at L0 + let mut current_level_no = 0; + let mut current_level_target_height = target_file_size; + loop { + // end LSN +1 to include possible image layers exactly at 'end_lsn'. + let all_layers = executor + .get_layers( + &(E::Key::MIN..E::Key::MAX), + &(Lsn(u64::MIN)..end_lsn + 1), + ctx, + ) + .await?; + info!( + "Compacting L{}, total # of layers: {}", + current_level_no, + all_layers.len() + ); + + // Identify the range of LSNs that belong to this level. We assume that + // each file in this level spans an LSN range up to 1.75x target file + // size. That should give us enough slop that if we created a slightly + // oversized L0 layer, e.g. because flushing the in-memory layer was + // delayed for some reason, we don't consider the oversized layer to + // belong to L1. 
But not too much slop, that we don't accidentally + // "skip" levels. + let max_height = (current_level_target_height as f64 * 1.75) as u64; + let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else { + break; + }; + + // Calculate the height of this level. If the # of tiers exceeds the + // fanout parameter, it's time to compact it. + let depth = level.depth(); + info!( + "Level {} identified as LSN range {}-{}: depth {}", + current_level_no, level.lsn_range.start, level.lsn_range.end, depth + ); + for l in &level.layers { + debug!("LEVEL {} layer: {}", current_level_no, l.short_id()); + } + if depth < fanout { + debug!( + level = current_level_no, + depth = depth, + fanout, + "too few deltas to compact" + ); + break; + } + + compact_level( + &level.lsn_range, + &level.layers, + executor, + target_file_size, + ctx, + ) + .await?; + if current_level_target_height == u64::MAX { + // our target height includes all possible lsns + info!( + level = current_level_no, + depth = depth, + "compaction loop reached max current_level_target_height" + ); + break; + } + current_level_no += 1; + current_level_target_height = current_level_target_height.saturating_mul(exp_base); + } + Ok(()) +} + +async fn compact_level( + lsn_range: &Range, + layers: &[E::Layer], + executor: &mut E, + target_file_size: u64, + ctx: &E::RequestContext, +) -> anyhow::Result { + let mut layer_fragments = Vec::new(); + for l in layers { + layer_fragments.push(LayerFragment::new(l.clone())); + } + + let mut state = LevelCompactionState { + shard_identity: *executor.get_shard_identity(), + target_file_size, + _lsn_range: lsn_range.clone(), + layers: layer_fragments, + jobs: Vec::new(), + job_queue: Vec::new(), + next_level: false, + executor, + }; + + let first_job = CompactionJob { + key_range: E::Key::MIN..E::Key::MAX, + lsn_range: lsn_range.clone(), + strategy: CompactionStrategy::Divide, + input_layers: state + .layers + .iter() + .enumerate() + .map(|i| LayerId(i.0)) + 
.collect(), + completed: false, + }; + + state.jobs.push(first_job); + state.job_queue.push(JobId(0)); + state.execute(ctx).await?; + + info!( + "compaction completed! Need to process next level: {}", + state.next_level + ); + + Ok(state.next_level) +} + +/// Blackboard that keeps track of the state of all the jobs and work remaining +struct LevelCompactionState<'a, E> +where + E: CompactionJobExecutor, +{ + shard_identity: ShardIdentity, + + // parameters + target_file_size: u64, + + _lsn_range: Range, + layers: Vec>, + + // job queue + jobs: Vec>, + job_queue: Vec, + + /// If false, no need to compact levels below this + next_level: bool, + + /// Interface to the outside world + executor: &'a mut E, +} + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct LayerId(usize); +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] +struct JobId(usize); + +struct PendingJobSet { + pending: HashSet, + completed: HashSet, +} + +impl PendingJobSet { + fn new() -> Self { + PendingJobSet { + pending: HashSet::new(), + completed: HashSet::new(), + } + } + + fn complete_job(&mut self, job_id: JobId) { + self.pending.remove(&job_id); + self.completed.insert(job_id); + } + + fn all_completed(&self) -> bool { + self.pending.is_empty() + } +} + +// When we decide to rewrite a set of layers, LayerFragment is used to keep +// track which new layers supersede an old layer. When all the stakeholder jobs +// have completed, this layer can be deleted. +struct LayerFragment +where + E: CompactionJobExecutor, +{ + layer: E::Layer, + + // If we will write new layers to replace this one, this keeps track of the + // jobs that need to complete before this layer can be deleted. As the jobs + // complete, they are moved from 'pending' to 'completed' set. Once the + // 'pending' set becomes empty, the layer can be deleted. + // + // If None, this layer is not rewritten and must not be deleted. 
+ deletable_after: Option, + + deleted: bool, +} + +impl LayerFragment +where + E: CompactionJobExecutor, +{ + fn new(layer: E::Layer) -> Self { + LayerFragment { + layer, + deletable_after: None, + deleted: false, + } + } +} + +#[derive(PartialEq)] +enum CompactionStrategy { + Divide, + CreateDelta, + CreateImage, +} + +struct CompactionJob { + key_range: Range, + lsn_range: Range, + + strategy: CompactionStrategy, + + input_layers: Vec, + + completed: bool, +} + +impl<'a, E> LevelCompactionState<'a, E> +where + E: CompactionJobExecutor, +{ + /// Main loop of the executor. + /// + /// In each iteration, we take the next job from the queue, and execute it. + /// The execution might add new jobs to the queue. Keep going until the + /// queue is empty. + /// + /// Initially, the job queue consists of one Divide job over the whole + /// level. On first call, it is divided into smaller jobs. + async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> { + // TODO: this would be pretty straightforward to parallelize with FuturesUnordered + while let Some(next_job_id) = self.job_queue.pop() { + info!("executing job {}", next_job_id.0); + self.execute_job(next_job_id, ctx).await?; + } + + // all done! + Ok(()) + } + + async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + match job.strategy { + CompactionStrategy::Divide => { + self.divide_job(job_id, ctx).await?; + Ok(()) + } + CompactionStrategy::CreateDelta => { + let mut deltas: Vec = Vec::new(); + let mut layer_ids: Vec = Vec::new(); + for layer_id in &job.input_layers { + let layer = &self.layers[layer_id.0].layer; + if let Some(dl) = self.executor.downcast_delta_layer(layer).await? { + deltas.push(dl.clone()); + layer_ids.push(*layer_id); + } + } + + self.executor + .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx) + .await?; + self.jobs[job_id.0].completed = true; + + // did we complete any fragments? 
+ for layer_id in layer_ids { + let l = &mut self.layers[layer_id.0]; + if let Some(deletable_after) = l.deletable_after.as_mut() { + deletable_after.complete_job(job_id); + if deletable_after.all_completed() { + self.executor.delete_layer(&l.layer, ctx).await?; + l.deleted = true; + } + } + } + + self.next_level = true; + + Ok(()) + } + CompactionStrategy::CreateImage => { + self.executor + .create_image(job.lsn_range.end, &job.key_range, ctx) + .await?; + self.jobs[job_id.0].completed = true; + + // TODO: we could check if any layers < PITR horizon became deletable + Ok(()) + } + } + } + + fn push_job(&mut self, job: CompactionJob) -> JobId { + let job_id = JobId(self.jobs.len()); + self.jobs.push(job); + self.job_queue.push(job_id); + job_id + } + + /// Take a partition of the key space, and decide how to compact it. + /// + /// TODO: Currently, this is called exactly once for the level, and we + /// decide whether to create new image layers to cover the whole level, or + /// write a new set of deltas. In the future, this should try to partition + /// the key space, and make the decision separately for each partition. + async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Check for dummy cases + if job.input_layers.is_empty() { + return Ok(()); + } + + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Would it be better to create images for this partition? 
+ // Decide based on the average density of the level + let keyspace_size = keyspace_total_size( + &self + .executor + .get_keyspace(&job.key_range, job.lsn_range.end, ctx) + .await?, + &self.shard_identity, + ) * PAGE_SZ; + + let wal_size = job + .input_layers + .iter() + .filter(|layer_id| self.layers[layer_id.0].layer.is_delta()) + .map(|layer_id| self.layers[layer_id.0].layer.file_size()) + .sum::(); + if keyspace_size < wal_size { + // seems worth it + info!( + "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}", + keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size + ); + self.cover_with_images(job_id, ctx).await + } else { + // do deltas + info!( + "coverage not worth it, keyspace_size {}, wal_size {}", + keyspace_size, wal_size + ); + self.retile_deltas(job_id, ctx).await + } + } + + // LSN + // ^ + // | + // | ###|###|##### + // | +--+-----+--+ +--+-----+--+ + // | | | | | | | | | + // | +--+--+--+--+ +--+--+--+--+ + // | | | | | | | + // | +---+-+-+---+ ==> +---+-+-+---+ + // | | | | | | | | | + // | +---+-+-++--+ +---+-+-++--+ + // | | | | | | | | | + // | +-----+--+--+ +-----+--+--+ + // | + // +--------------> key + // + async fn cover_with_images( + &mut self, + job_id: JobId, + ctx: &E::RequestContext, + ) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // XXX: do we still need the "holes" stuff? 
+ + let mut new_jobs = Vec::new(); + + // Slide a window through the keyspace + let keyspace = self + .executor + .get_keyspace(&job.key_range, job.lsn_range.end, ctx) + .await?; + + let mut window = KeyspaceWindow::new( + E::Key::MIN..E::Key::MAX, + keyspace, + self.target_file_size / PAGE_SZ, + ); + while let Some(key_range) = window.choose_next_image(&self.shard_identity) { + new_jobs.push(CompactionJob:: { + key_range, + lsn_range: job.lsn_range.clone(), + strategy: CompactionStrategy::CreateImage, + input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer? + completed: false, + }); + } + + for j in new_jobs.into_iter().rev() { + let _job_id = self.push_job(j); + + // TODO: image layers don't let us delete anything. unless < PITR horizon + //let j = &self.jobs[job_id.0]; + // for layer_id in j.input_layers.iter() { + // self.layers[layer_id.0].pending_stakeholders.insert(job_id); + //} + } + + Ok(()) + } + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through + // the key space, and for each key, check if including the next key to the + // current output layer we're building would cause the layer to become too + // large. If so, dump the current output layer and start new one. It's + // possible that there is a single key with so many page versions that + // storing all of them in a single layer file would be too large. In that + // case, we also split on the LSN dimension. 
+ // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + async fn retile_deltas( + &mut self, + job_id: JobId, + ctx: &E::RequestContext, + ) -> anyhow::Result<()> { + let job = &self.jobs[job_id.0]; + assert!(job.strategy == CompactionStrategy::Divide); + + // Sweep the key space left to right, running an estimate of how much + // disk size and keyspace we have accumulated + // + // Once the disk size reaches the target threshold, stop and think. + // If we have accumulated only a narrow band of keyspace, create an + // image layer. Otherwise write a delta layer. + + // FIXME: we are ignoring images here. Did we already divide the work + // so that we won't encounter them here? + + let mut deltas: Vec = Vec::new(); + for layer_id in &job.input_layers { + let l = &self.layers[layer_id.0]; + if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? { + deltas.push(dl.clone()); + } + } + // Open stream + let key_value_stream = + std::pin::pin!(merge_delta_keys_buffered::(deltas.as_slice(), ctx) + .await? 
+ .map(Result::<_, anyhow::Error>::Ok)); + let mut new_jobs = Vec::new(); + + // Slide a window through the keyspace + let mut key_accum = + std::pin::pin!(accum_key_values(key_value_stream, self.target_file_size)); + let mut all_in_window: bool = false; + let mut window = Window::new(); + + // Helper function to create a job for a new delta layer with given key-lsn + // rectangle. + let create_delta_job = |key_range, lsn_range: &Range, new_jobs: &mut Vec<_>| { + // The inputs for the job are all the input layers of the original job that + // overlap with the rectangle. + let batch_layers: Vec = job + .input_layers + .iter() + .filter(|layer_id| { + overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range) + }) + .cloned() + .collect(); + assert!(!batch_layers.is_empty()); + new_jobs.push(CompactionJob { + key_range, + lsn_range: lsn_range.clone(), + strategy: CompactionStrategy::CreateDelta, + input_layers: batch_layers, + completed: false, + }); + }; + + loop { + if all_in_window && window.is_empty() { + // All done! + break; + } + + // If we now have enough keyspace for next delta layer in the window, create a + // new delta layer + if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window) + { + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + continue; + } + assert!(!all_in_window); + + // Process next key in the key space + match key_accum.next().await.transpose()? { + None => { + all_in_window = true; + } + Some(next_key) if next_key.partition_lsns.is_empty() => { + // Normal case: extend the window by the key + window.feed(next_key.key, next_key.size); + } + Some(next_key) => { + // A key with too large size impact for a single delta layer. This + // case occurs if you make a huge number of updates for a single key. + // + // Drain the window with has_more = false to make a clean cut before + // the key, and then make dedicated delta layers for the single key. 
+ // + // We cannot cluster the key with the others, because we don't want + // layer files to overlap with each other in the lsn,key space (no + // overlaps for the rectangles). + let key = next_key.key; + debug!("key {key} with size impact larger than the layer size"); + while !window.is_empty() { + let has_more = false; + let key_range = window.choose_next_delta(self.target_file_size, has_more) + .expect("with has_more==false, choose_next_delta always returns something for a non-empty Window"); + create_delta_job(key_range, &job.lsn_range, &mut new_jobs); + } + + // Not really required: but here for future resilience: + // We make a "gap" here, so any structure the window holds should + // probably be reset. + window = Window::new(); + + let mut prior_lsn = job.lsn_range.start; + let mut lsn_ranges = Vec::new(); + for (lsn, _size) in next_key.partition_lsns.iter() { + lsn_ranges.push(prior_lsn..*lsn); + prior_lsn = *lsn; + } + lsn_ranges.push(prior_lsn..job.lsn_range.end); + for lsn_range in lsn_ranges { + let key_range = key..key.next(); + create_delta_job(key_range, &lsn_range, &mut new_jobs); + } + } + } + } + + // All the input files are rewritten. Set up the tracking for when they can + // be deleted. 
+ for layer_id in job.input_layers.iter() { + let l = &mut self.layers[layer_id.0]; + assert!(l.deletable_after.is_none()); + l.deletable_after = Some(PendingJobSet::new()); + } + for j in new_jobs.into_iter().rev() { + let job_id = self.push_job(j); + let j = &self.jobs[job_id.0]; + for layer_id in j.input_layers.iter() { + self.layers[layer_id.0] + .deletable_after + .as_mut() + .unwrap() + .pending + .insert(job_id); + } + } + + Ok(()) + } +} + +/// Sliding window through keyspace and values for image layer +/// This is used by [`LevelCompactionState::cover_with_images`] to decide on good split points +struct KeyspaceWindow { + head: KeyspaceWindowHead, + + start_pos: KeyspaceWindowPos, +} +struct KeyspaceWindowHead { + // overall key range to cover + key_range: Range, + + keyspace: Vec>, + target_keysize: u64, +} + +#[derive(Clone)] +struct KeyspaceWindowPos { + end_key: K, + + keyspace_idx: usize, + + accum_keysize: u64, +} +impl KeyspaceWindowPos { + fn reached_end(&self, w: &KeyspaceWindowHead) -> bool { + self.keyspace_idx == w.keyspace.len() + } + + // Advance the cursor until 'accum_keysize' reaches 'max_size' or the end of the keyspace. + fn advance_until_size( + &mut self, + w: &KeyspaceWindowHead, + max_size: u64, + shard_identity: &ShardIdentity, + ) { + while self.accum_keysize < max_size && !self.reached_end(w) { + let curr_range = &w.keyspace[self.keyspace_idx]; + if self.end_key < curr_range.start { + // skip over any unused space + self.end_key = curr_range.start; + } + + // We're now within 'curr_range'. Can we advance past it completely? 
+ let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity); + if (self.accum_keysize + distance as u64) < max_size { + // oh yeah, it fits + self.end_key = curr_range.end; + self.keyspace_idx += 1; + self.accum_keysize += distance as u64; + } else { + // advance within the range + let skip_key = self.end_key.skip_some(); + let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity); + if (self.accum_keysize + distance as u64) < max_size { + self.end_key = skip_key; + self.accum_keysize += distance as u64; + } else { + self.end_key = self.end_key.next(); + self.accum_keysize += 1; + } + } + } + } +} + +impl KeyspaceWindow +where + K: CompactionKey, +{ + fn new(key_range: Range, keyspace: CompactionKeySpace, target_keysize: u64) -> Self { + assert!(keyspace.first().unwrap().start >= key_range.start); + + let start_key = key_range.start; + let start_pos = KeyspaceWindowPos:: { + end_key: start_key, + keyspace_idx: 0, + accum_keysize: 0, + }; + Self { + head: KeyspaceWindowHead:: { + key_range, + keyspace, + target_keysize, + }, + start_pos, + } + } + + fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option> { + if self.start_pos.keyspace_idx == self.head.keyspace.len() { + // we've reached the end + return None; + } + + let mut next_pos = self.start_pos.clone(); + next_pos.advance_until_size( + &self.head, + self.start_pos.accum_keysize + self.head.target_keysize, + shard_identity, + ); + + // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to + // 1.25x target size + let mut end_pos = next_pos.clone(); + end_pos.advance_until_size( + &self.head, + self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4), + shard_identity, + ); + if end_pos.reached_end(&self.head) { + // gobble up any unused keyspace between the last used key and end of the range + assert!(end_pos.end_key <= self.head.key_range.end); + end_pos.end_key = self.head.key_range.end; + next_pos = end_pos; + 
} + + let start_key = self.start_pos.end_key; + self.start_pos = next_pos; + Some(start_key..self.start_pos.end_key) + } +} + +// Take previous partitioning, based on the image layers below. +// +// Candidate is at the front: +// +// Consider stretching an image layer to next divider? If it's close enough, +// that's the image candidate +// +// If it's too far, consider splitting at a reasonable point +// +// Is the image candidate smaller than the equivalent delta? If so, +// split off the image. Otherwise, split off one delta. +// Try to snap off the delta at a reasonable point + +struct WindowElement { + start_key: K, // inclusive + last_key: K, // inclusive + accum_size: u64, +} + +/// Sliding window through keyspace and values for delta layer tiling +/// +/// This is used to decide which delta layer to write next. +struct Window { + elems: VecDeque>, + + // last key that was split off, inclusive + splitoff_key: Option, + splitoff_size: u64, +} + +impl Window +where + K: CompactionKey, +{ + fn new() -> Self { + Self { + elems: VecDeque::new(), + splitoff_key: None, + splitoff_size: 0, + } + } + + fn feed(&mut self, key: K, size: u64) { + let last_size; + if let Some(last) = self.elems.back_mut() { + // We require the keys to be strictly increasing for the window. + // Keys should already have been deduplicated by `accum_key_values` + assert!( + last.last_key < key, + "last_key(={}) >= key(={key})", + last.last_key + ); + last_size = last.accum_size; + } else { + last_size = 0; + } + // This is a new key. 
+ let elem = WindowElement { + start_key: key, + last_key: key, + accum_size: last_size + size, + }; + self.elems.push_back(elem); + } + + fn remain_size(&self) -> u64 { + self.elems.back().unwrap().accum_size - self.splitoff_size + } + + fn peek_size(&self) -> u64 { + self.elems.front().unwrap().accum_size - self.splitoff_size + } + + fn is_empty(&self) -> bool { + self.elems.is_empty() + } + + fn commit_upto(&mut self, mut upto: usize) { + while upto > 1 { + let popped = self.elems.pop_front().unwrap(); + self.elems.front_mut().unwrap().start_key = popped.start_key; + upto -= 1; + } + } + + fn find_size_split(&self, target_size: u64) -> usize { + self.elems + .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size) + } + + fn pop(&mut self) { + let first = self.elems.pop_front().unwrap(); + self.splitoff_size = first.accum_size; + + self.splitoff_key = Some(first.last_key); + } + + // the difference between delta and image is that an image covers + // any unused keyspace before and after, while a delta tries to + // minimize that. 
TODO: difference not implemented + fn pop_delta(&mut self) -> Range { + let first = self.elems.front().unwrap(); + let key_range = first.start_key..first.last_key.next(); + + self.pop(); + key_range + } + + // Prerequisite: we have enough input in the window + // + // On return None, the caller should feed more data and call again + fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option> { + if has_more && self.elems.is_empty() { + // Starting up + return None; + } + + // If we still have an undersized candidate, just keep going + while self.peek_size() < target_size { + if self.elems.len() > 1 { + self.commit_upto(2); + } else if has_more { + return None; + } else { + break; + } + } + + // Ensure we have enough input in the window to make a good decision + if has_more && self.remain_size() < target_size * 5 / 4 { + return None; + } + + // The candidate on the front is now large enough, for a delta. + // And we have enough data in the window to decide. + + // If we're willing to stretch it up to 1.25 target size, could we + // gobble up the rest of the work? This avoids creating very small + // "tail" layers at the end of the keyspace + if !has_more && self.remain_size() < target_size * 5 / 4 { + self.commit_upto(self.elems.len()); + } else { + let delta_split_at = self.find_size_split(target_size); + self.commit_upto(delta_split_at); + + // If it's still not large enough, request the caller to fill the window + if self.elems.len() == 1 && has_more { + return None; + } + } + Some(self.pop_delta()) + } +} diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs new file mode 100644 index 0000000000..8ed1d16082 --- /dev/null +++ b/pageserver/compaction/src/helpers.rs @@ -0,0 +1,295 @@ +//! This file contains generic utility functions over the interface types, +//! which could be handy for any compaction implementation. 
+use crate::interface::*; + +use futures::future::BoxFuture; +use futures::{Stream, StreamExt}; +use itertools::Itertools; +use pageserver_api::shard::ShardIdentity; +use pin_project_lite::pin_project; +use std::collections::BinaryHeap; +use std::collections::VecDeque; +use std::fmt::Display; +use std::future::Future; +use std::ops::{DerefMut, Range}; +use std::pin::Pin; +use std::task::{ready, Poll}; +use utils::lsn::Lsn; + +pub const PAGE_SZ: u64 = 8192; + +pub fn keyspace_total_size( + keyspace: &CompactionKeySpace, + shard_identity: &ShardIdentity, +) -> u64 +where + K: CompactionKey, +{ + keyspace + .iter() + .map(|r| K::key_range_size(r, shard_identity) as u64) + .sum() +} + +pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) +} + +pub fn union_to_keyspace(a: &mut CompactionKeySpace, b: CompactionKeySpace) { + let x = std::mem::take(a); + let mut all_ranges_iter = [x.into_iter(), b.into_iter()] + .into_iter() + .kmerge_by(|a, b| a.start < b.start); + let mut ranges = Vec::new(); + if let Some(first) = all_ranges_iter.next() { + let (mut start, mut end) = (first.start, first.end); + + for r in all_ranges_iter { + assert!(r.start >= start); + if r.start > end { + ranges.push(start..end); + start = r.start; + end = r.end; + } else if r.end > end { + end = r.end; + } + } + ranges.push(start..end); + } + *a = ranges +} + +pub fn intersect_keyspace( + a: &CompactionKeySpace, + r: &Range, +) -> CompactionKeySpace { + let mut ranges: Vec> = Vec::new(); + + for x in a.iter() { + if x.end <= r.start { + continue; + } + if x.start >= r.end { + break; + } + ranges.push(x.clone()) + } + + // trim the ends + if let Some(first) = ranges.first_mut() { + first.start = std::cmp::max(first.start, r.start); + } + if let Some(last) = ranges.last_mut() { + last.end = std::cmp::min(last.end, r.end); + } + ranges +} + +/// Create a stream that iterates through all DeltaEntrys among all input +/// layers, in key-lsn order. 
+/// +/// This is public because the create_delta() implementation likely wants to use this too +/// TODO: move to a more shared place +pub fn merge_delta_keys<'a, E: CompactionJobExecutor>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> MergeDeltaKeys<'a, E> { + // Use a binary heap to merge the layers. Each input layer is initially + // represented by a LazyLoadLayer::Unloaded element, which uses the start of + // the layer's key range as the key. The first time a layer reaches the top + // of the heap, all the keys of the layer are loaded into a sorted vector. + // + // This helps to keep the memory usage reasonable: we only need to hold in + // memory the DeltaEntrys of the layers that overlap with the "current" key. + let mut heap: BinaryHeap> = BinaryHeap::new(); + for l in layers { + heap.push(LazyLoadLayer::Unloaded(l)); + } + MergeDeltaKeys { + heap, + ctx, + load_future: None, + } +} + +pub async fn merge_delta_keys_buffered<'a, E: CompactionJobExecutor + 'a>( + layers: &'a [E::DeltaLayer], + ctx: &'a E::RequestContext, +) -> anyhow::Result>::DeltaEntry<'a>>> +{ + let mut keys = Vec::new(); + for l in layers { + // Boxing and casting to LoadFuture is required to obtain the right Sync bound. + // If we do l.load_keys(ctx).await? directly, there is a compilation error. 
+ let load_future: LoadFuture<'a, _> = Box::pin(l.load_keys(ctx)); + keys.extend(load_future.await?.into_iter()); + } + keys.sort_by_key(|k| (k.key(), k.lsn())); + let stream = futures::stream::iter(keys.into_iter()); + Ok(stream) +} + +enum LazyLoadLayer<'a, E: CompactionJobExecutor> { + Loaded(VecDeque<>::DeltaEntry<'a>>), + Unloaded(&'a E::DeltaLayer), +} +impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { + fn min_key(&self) -> E::Key { + match self { + Self::Loaded(entries) => entries.front().unwrap().key(), + Self::Unloaded(dl) => dl.key_range().start, + } + } + fn min_lsn(&self) -> Lsn { + match self { + Self::Loaded(entries) => entries.front().unwrap().lsn(), + Self::Unloaded(dl) => dl.lsn_range().start, + } + } +} +impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // reverse order so that we get a min-heap + (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) + } +} +impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == std::cmp::Ordering::Equal + } +} +impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} + +type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result>>; + +// Stream returned by `merge_delta_keys` +pin_project! 
{ +#[allow(clippy::type_complexity)] +pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> { + heap: BinaryHeap>, + + #[pin] + load_future: Option>::DeltaEntry<'a>>>, + + ctx: &'a E::RequestContext, +} +} + +impl<'a, E> Stream for MergeDeltaKeys<'a, E> +where + E: CompactionJobExecutor + 'a, +{ + type Item = anyhow::Result<>::DeltaEntry<'a>>; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll::Item>> { + let mut this = self.project(); + loop { + if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() { + // We are waiting for loading the keys to finish + match ready!(load_future.as_mut().poll(cx)) { + Ok(entries) => { + this.load_future.set(None); + *this.heap.peek_mut().unwrap() = + LazyLoadLayer::Loaded(VecDeque::from(entries)); + } + Err(e) => { + return Poll::Ready(Some(Err(e))); + } + } + } + + // If the topmost layer in the heap hasn't been loaded yet, start + // loading it. Otherwise return the next entry from it and update + // the layer's position in the heap (this decreaseKey operation is + // performed implicitly when `top` is dropped). 
+ if let Some(mut top) = this.heap.peek_mut() { + match top.deref_mut() { + LazyLoadLayer::Unloaded(ref mut l) => { + let fut = l.load_keys(this.ctx); + this.load_future.set(Some(Box::pin(fut))); + continue; + } + LazyLoadLayer::Loaded(ref mut entries) => { + let result = entries.pop_front().unwrap(); + if entries.is_empty() { + std::collections::binary_heap::PeekMut::pop(top); + } + return Poll::Ready(Some(Ok(result))); + } + } + } else { + return Poll::Ready(None); + } + } + } +} + +// Accumulate values at key boundaries +pub struct KeySize { + pub key: K, + pub num_values: u64, + pub size: u64, + /// The lsns to partition at (if empty then no per-lsn partitioning) + pub partition_lsns: Vec<(Lsn, u64)>, +} + +pub fn accum_key_values<'a, I, K, D, E>( + input: I, + target_size: u64, +) -> impl Stream, E>> +where + K: Eq + PartialOrd + Display + Copy, + I: Stream>, + D: CompactionDeltaEntry<'a, K>, +{ + async_stream::try_stream! { + // Initialize the state from the first value + let mut input = std::pin::pin!(input); + + if let Some(first) = input.next().await { + let first = first?; + let mut part_size = first.size(); + let mut accum: KeySize = KeySize { + key: first.key(), + num_values: 1, + size: part_size, + partition_lsns: Vec::new(), + }; + let mut last_key = accum.key; + while let Some(this) = input.next().await { + let this = this?; + if this.key() == accum.key { + let add_size = this.size(); + if part_size + add_size > target_size { + accum.partition_lsns.push((this.lsn(), part_size)); + part_size = 0; + } + part_size += add_size; + accum.size += add_size; + accum.num_values += 1; + } else { + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); + last_key = accum.key; + yield accum; + part_size = this.size(); + accum = KeySize { + key: this.key(), + num_values: 1, + size: part_size, + partition_lsns: Vec::new(), + }; + } + } + assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key); + yield accum; + } + 
} +} diff --git a/pageserver/compaction/src/identify_levels.rs b/pageserver/compaction/src/identify_levels.rs new file mode 100644 index 0000000000..1853afffdd --- /dev/null +++ b/pageserver/compaction/src/identify_levels.rs @@ -0,0 +1,381 @@ +//! An LSM tree consists of multiple levels, each exponentially larger than the +//! previous level. And each level consists of multiple "tiers". With tiered +//! compaction, a level is compacted when it has accumulated more than N tiers, +//! forming one tier on the next level. +//! +//! In the pageserver, we don't explicitly track the levels and tiers. Instead, +//! we identify them by looking at the shapes of the layers. It's an easy task +//! for a human, but it's not straightforward to come up with the exact +//! rules. Especially if there are cases like interrupted, half-finished +//! compactions, or highly skewed data distributions that have let us "skip" +//! some levels. It's not critical to classify all cases correctly; at worst we +//! delay some compaction work, and suffer from more read amplification, or we +//! perform some unnecessary compaction work. +//! +//! `identify_level` performs that shape-matching. +//! +//! It returns a Level struct, which has `depth()` function to count the number +//! of "tiers" in the level. The tier count is the max depth of stacked layers +//! within the level. That's a good measure, because the point of compacting is +//! to reduce read amplification, and the depth is what determines that. +//! +//! One interesting effect of this is that if we generate very small delta +//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than +//! because they reach the target size, the L0 compaction will combine them to +//! one larger file. But if the combined file is still smaller than the target +//! file size, the file will still be considered to be part of L0 at the next +//! iteration. 
+ +use anyhow::bail; +use std::collections::BTreeSet; +use std::ops::Range; +use utils::lsn::Lsn; + +use crate::interface::*; + +use tracing::{info, trace}; + +pub struct Level { + pub lsn_range: Range, + pub layers: Vec, +} + +/// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are +/// no layers that cross the boundary LSN. +/// +/// A further restriction is that all layers in the returned partition cover at +/// most 'lsn_max_size' LSN bytes. +pub async fn identify_level( + all_layers: Vec, + end_lsn: Lsn, + lsn_max_size: u64, +) -> anyhow::Result>> +where + K: CompactionKey, + L: CompactionLayer + Clone, +{ + // filter out layers that are above the `end_lsn`, they are completely irrelevant. + let mut layers = Vec::new(); + for l in all_layers { + if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn { + // shouldn't happen. Indicates that the caller passed a bogus + // end_lsn. + bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id()); + } + // include image layers sitting exactly at `end_lsn`. + let is_image = !l.is_delta(); + if (is_image && l.lsn_range().start > end_lsn) + || (!is_image && l.lsn_range().start >= end_lsn) + { + continue; + } + layers.push(l); + } + // All the remaining layers either belong to this level, or are below it. + info!( + "identify level at {}, size {}, num layers below: {}", + end_lsn, + lsn_max_size, + layers.len() + ); + if layers.is_empty() { + return Ok(None); + } + + // Walk the ranges in LSN order. + // + // ----- end_lsn + // | + // | + // v + // + layers.sort_by_key(|l| l.lsn_range().end); + let mut candidate_start_lsn = end_lsn; + let mut candidate_layers: Vec = Vec::new(); + let mut current_best_start_lsn = end_lsn; + let mut current_best_layers: Vec = Vec::new(); + let mut iter = layers.into_iter(); + loop { + let Some(l) = iter.next_back() else { + // Reached end. 
Accept the last candidate + current_best_start_lsn = candidate_start_lsn; + current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers)); + break; + }; + trace!( + "inspecting {} for candidate {}, current best {}", + l.short_id(), + candidate_start_lsn, + current_best_start_lsn + ); + + let r = l.lsn_range(); + + // Image layers don't restrict our choice of cutoff LSN + if l.is_delta() { + // Is this candidate workable? In other words, are there any + // delta layers that span across this LSN + // + // Valid: Not valid: + // + + + // | | + + // + <- candidate + | <- candidate + // + + + // | + // + + if r.end <= candidate_start_lsn { + // Hooray, there are no crossing LSNs. And we have visited + // through all the layers within candidate..end_lsn. The + // current candidate can be accepted. + current_best_start_lsn = r.end; + current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers)); + candidate_start_lsn = r.start; + } + + // Is it small enough to be considered part of this level? + if r.end.0 - r.start.0 > lsn_max_size { + // Too large, this layer belongs to next level. Stop. + trace!( + "too large {}, size {} vs {}", + l.short_id(), + r.end.0 - r.start.0, + lsn_max_size + ); + break; + } + + // If this crosses the candidate lsn, push it down. + if r.start < candidate_start_lsn { + trace!( + "layer {} prevents from stopping at {}", + l.short_id(), + candidate_start_lsn + ); + candidate_start_lsn = r.start; + } + } + + // Include this layer in our candidate + candidate_layers.push(l); + } + + Ok(if current_best_start_lsn == end_lsn { + // empty level + None + } else { + Some(Level { + lsn_range: current_best_start_lsn..end_lsn, + layers: current_best_layers, + }) + }) +} + +impl Level { + /// Count the number of deltas stacked on each other. 
+ pub fn depth(&self) -> u64 + where + K: CompactionKey, + L: CompactionLayer, + { + struct Event { + key: K, + layer_idx: usize, + start: bool, + } + let mut events: Vec> = Vec::new(); + for (idx, l) in self.layers.iter().enumerate() { + let key_range = l.key_range(); + if key_range.end == key_range.start.next() && l.is_delta() { + // Ignore single-key delta layers as they can be stacked on top of each other + // as that is the only way to cut further. + continue; + } + events.push(Event { + key: l.key_range().start, + layer_idx: idx, + start: true, + }); + events.push(Event { + key: l.key_range().end, + layer_idx: idx, + start: false, + }); + } + events.sort_by_key(|e| (e.key, e.start)); + + // Sweep the key space left to right. Stop at each distinct key, and + // count the number of deltas on top of the highest image at that key. + // + // This is a little inefficient, as we walk through the active_set on + // every key. We could increment/decrement a counter on each step + // instead, but that'd require a bit more complex bookkeeping. 
+ let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new(); + let mut max_depth = 0; + let mut events_iter = events.iter().peekable(); + while let Some(e) = events_iter.next() { + let l = &self.layers[e.layer_idx]; + let is_image = !l.is_delta(); + + // update the active set + if e.start { + active_set.insert((l.lsn_range().end, is_image, e.layer_idx)); + } else { + active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx)); + } + + // recalculate depth if this was the last event at this point + let more_events_at_this_key = events_iter + .peek() + .map_or(false, |next_e| next_e.key == e.key); + if !more_events_at_this_key { + let mut active_depth = 0; + for (_end_lsn, is_image, _idx) in active_set.iter().rev() { + if *is_image { + break; + } + active_depth += 1; + } + if active_depth > max_depth { + max_depth = active_depth; + } + } + } + debug_assert_eq!(active_set, BTreeSet::new()); + max_depth + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer}; + use std::sync::{Arc, Mutex}; + + fn delta(key_range: Range, lsn_range: Range) -> MockLayer { + MockLayer::Delta(Arc::new(MockDeltaLayer { + key_range, + lsn_range, + // identify_level() doesn't pay attention to the rest of the fields + file_size: 0, + deleted: Mutex::new(false), + records: vec![], + })) + } + + fn image(key_range: Range, lsn: Lsn) -> MockLayer { + MockLayer::Image(Arc::new(MockImageLayer { + key_range, + lsn_range: lsn..(lsn + 1), + // identify_level() doesn't pay attention to the rest of the fields + file_size: 0, + deleted: Mutex::new(false), + })) + } + + #[tokio::test] + async fn test_identify_level() -> anyhow::Result<()> { + let layers = vec![ + delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)), + delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)), + delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), + delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), + delta(Key::MIN..Key::MAX, 
Lsn(0x2000)..Lsn(0x3000)), + delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)), + ]; + + // All layers fit in the max file size + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.depth(), 6); + + // Same LSN with smaller max file size. The second layer from the top is larger + // and belongs to next level. + let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 1); + + // Call with a smaller LSN + let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 2); + + // Call with an LSN that doesn't partition the space + let result = identify_level(layers, Lsn(0x6000), 0x1000).await; + assert!(result.is_err()); + Ok(()) + } + + #[tokio::test] + async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> { + // The files LSN ranges overlap, so even though there are more files that + // fit under the file size, they are not included in the level because they + // overlap so that we'd need to include the oldest file, too, which is + // larger + let layers = vec![ + delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)), + delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap + delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000) + .await? + .unwrap(); + assert_eq!(level.depth(), 1); + + Ok(()) + } + + #[tokio::test] + async fn test_depth_nonoverlapping() -> anyhow::Result<()> { + // The key ranges don't overlap, so depth is only 1. + let layers = vec![ + delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)), + delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)), + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? 
+ .unwrap(); + assert_eq!(level.layers.len(), 3); + assert_eq!(level.depth(), 1); + + // Staggered. The 1st and 3rd layer don't overlap with each other. + let layers = vec![ + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), + delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 3); + assert_eq!(level.depth(), 2); + Ok(()) + } + + #[tokio::test] + async fn test_depth_images() -> anyhow::Result<()> { + let layers: Vec = vec![ + delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)), + delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)), + delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)), + // This covers the same key range as the 2nd delta layer. The depth + // in that key range is therefore 0. + image(1500..2500, Lsn(0x9000)), + ]; + + let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000) + .await? + .unwrap(); + assert_eq!(level.layers.len(), 4); + assert_eq!(level.depth(), 1); + Ok(()) + } +} diff --git a/pageserver/compaction/src/interface.rs b/pageserver/compaction/src/interface.rs new file mode 100644 index 0000000000..35519b5d0a --- /dev/null +++ b/pageserver/compaction/src/interface.rs @@ -0,0 +1,165 @@ +//! This is what the compaction implementation needs to know about +//! layers, keyspace etc. +//! +//! All the heavy lifting is done by the create_image and create_delta +//! functions that the implementor provides. +use futures::Future; +use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity}; +use std::ops::Range; +use utils::lsn::Lsn; + +/// Public interface. This is the main thing that the implementor needs to provide +pub trait CompactionJobExecutor { + // Type system. + // + // We assume that there are two kinds of layers, deltas and images. The + // compaction doesn't distinguish whether they are stored locally or + // remotely. 
+ // + // The keyspace is defined by the CompactionKey trait. + type Key: CompactionKey; + + type Layer: CompactionLayer + Clone; + type DeltaLayer: CompactionDeltaLayer + Clone; + type ImageLayer: CompactionImageLayer + Clone; + + // This is passed through to all the interface functions. The compaction + // implementation doesn't do anything with it, but it might be useful for + // the interface implementation. + type RequestContext: CompactionRequestContext; + + // ---- + // Functions that the planner uses to support its decisions + // ---- + + fn get_shard_identity(&self) -> &ShardIdentity; + + /// Return all layers that overlap the given bounding box. + fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + ctx: &Self::RequestContext, + ) -> impl Future>> + Send; + + fn get_keyspace( + &mut self, + key_range: &Range, + lsn: Lsn, + ctx: &Self::RequestContext, + ) -> impl Future>> + Send; + + /// NB: This is a pretty expensive operation. In the real pageserver + /// implementation, it downloads the layer, and keeps it resident + /// until the DeltaLayer is dropped. + fn downcast_delta_layer( + &self, + layer: &Self::Layer, + ) -> impl Future>> + Send; + + // ---- + // Functions to execute the plan + // ---- + + /// Create a new image layer, materializing all the values in the key range, + /// at given 'lsn'. + fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &Self::RequestContext, + ) -> impl Future> + Send; + + /// Create a new delta layer, containing all the values from 'input_layers' + /// in the given key and LSN range. + fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[Self::DeltaLayer], + ctx: &Self::RequestContext, + ) -> impl Future> + Send; + + /// Delete a layer. The compaction implementation will call this only after + /// all the create_image() or create_delta() calls that deletion of this + /// layer depends on have finished. 
But if the implementor has extra lazy + /// background tasks, like uploading the index json file to remote storage, + /// it is the implementation's responsibility to track those. + fn delete_layer( + &mut self, + layer: &Self::Layer, + ctx: &Self::RequestContext, + ) -> impl Future> + Send; +} + +pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display { + const MIN: Self; + const MAX: Self; + + /// Calculate distance between key_range.start and key_range.end. + /// + /// This returns u32, for compatibility with Repository::key. If the + /// distance is larger, return u32::MAX. + fn key_range_size(key_range: &Range, shard_identity: &ShardIdentity) -> u32; + + // return "self + 1" + fn next(&self) -> Self; + + // return "self + <some amount>". The amount to skip + // is left to the implementation. + // FIXME: why not just "add(u32)" ? This is hard to use + fn skip_some(&self) -> Self; +} + +impl CompactionKey for Key { + const MIN: Self = Self::MIN; + const MAX: Self = Self::MAX; + + fn key_range_size(r: &std::ops::Range, shard_identity: &ShardIdentity) -> u32 { + ShardedRange::new(r.clone(), shard_identity).page_count() + } + fn next(&self) -> Key { + (self as &Key).next() + } + fn skip_some(&self) -> Key { + self.add(128) + } +} + +/// Contiguous ranges of keys that belong to the key space. In key order, and +/// with no overlap. +pub type CompactionKeySpace = Vec>; + +/// Functions needed from all layers. +pub trait CompactionLayer { + fn key_range(&self) -> &Range; + fn lsn_range(&self) -> &Range; + + fn file_size(&self) -> u64; + + /// For debugging, short human-readable representation of the layer. E.g. filename. + fn short_id(&self) -> String; + + fn is_delta(&self) -> bool; +} +pub trait CompactionDeltaLayer: CompactionLayer { + type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key> + where + Self: 'a; + + /// Return all keys in this delta layer. 
+ fn load_keys<'a>( + &self, + ctx: &E::RequestContext, + ) -> impl Future>>> + Send; +} + +pub trait CompactionImageLayer: CompactionLayer {} + +pub trait CompactionDeltaEntry<'a, K> { + fn key(&self) -> K; + fn lsn(&self) -> Lsn; + fn size(&self) -> u64; +} + +pub trait CompactionRequestContext {} diff --git a/pageserver/compaction/src/lib.rs b/pageserver/compaction/src/lib.rs new file mode 100644 index 0000000000..2d6d673de5 --- /dev/null +++ b/pageserver/compaction/src/lib.rs @@ -0,0 +1,12 @@ +// The main module implementing the compaction algorithm +pub mod compact_tiered; +pub(crate) mod identify_levels; + +// Traits that the caller of the compaction needs to implement +pub mod interface; + +// Utility functions, useful for the implementation +pub mod helpers; + +// A simulator with mock implementations of 'interface' +pub mod simulator; diff --git a/pageserver/compaction/src/simulator.rs b/pageserver/compaction/src/simulator.rs new file mode 100644 index 0000000000..776c537d03 --- /dev/null +++ b/pageserver/compaction/src/simulator.rs @@ -0,0 +1,617 @@ +mod draw; + +use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp}; + +use futures::StreamExt; +use pageserver_api::shard::ShardIdentity; +use rand::Rng; +use tracing::info; + +use utils::lsn::Lsn; + +use std::fmt::Write; +use std::ops::Range; +use std::sync::Arc; +use std::sync::Mutex; + +use crate::helpers::PAGE_SZ; +use crate::helpers::{merge_delta_keys, overlaps_with}; + +use crate::interface; +use crate::interface::CompactionLayer; + +// +// Implementation for the CompactionExecutor interface +// +pub struct MockTimeline { + // Parameters for the compaction algorithm + pub target_file_size: u64, + tiers_per_level: u64, + + num_l0_flushes: u64, + last_compact_at_flush: u64, + last_flush_lsn: Lsn, + + // In-memory layer + records: Vec, + total_len: u64, + start_lsn: Lsn, + end_lsn: Lsn, + + // Current keyspace at `end_lsn`. This is updated on every ingested record. 
+ keyspace: KeySpace, + + // historic keyspaces + old_keyspaces: Vec<(Lsn, KeySpace)>, + + // "on-disk" layers + pub live_layers: Vec, + + num_deleted_layers: u64, + + // Statistics + wal_ingested: u64, + bytes_written: u64, + bytes_deleted: u64, + layers_created: u64, + layers_deleted: u64, + + // All the events - creation and deletion of files - are collected + // in 'history'. It is used to draw the SVG animation at the end. + time: u64, + history: Vec, +} + +type KeySpace = interface::CompactionKeySpace; + +pub struct MockRequestContext {} +impl interface::CompactionRequestContext for MockRequestContext {} + +pub type Key = u64; + +impl interface::CompactionKey for Key { + const MIN: Self = u64::MIN; + const MAX: Self = u64::MAX; + + fn key_range_size(key_range: &Range, _shard_identity: &ShardIdentity) -> u32 { + std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32 + } + + fn next(&self) -> Self { + self + 1 + } + fn skip_some(&self) -> Self { + // round up to next xx + self + 100 + } +} + +#[derive(Clone)] +pub struct MockRecord { + lsn: Lsn, + key: Key, + len: u64, +} + +impl interface::CompactionDeltaEntry<'_, Key> for MockRecord { + fn key(&self) -> Key { + self.key + } + fn lsn(&self) -> Lsn { + self.lsn + } + fn size(&self) -> u64 { + self.len + } +} + +pub struct MockDeltaLayer { + pub key_range: Range, + pub lsn_range: Range, + + pub file_size: u64, + + pub deleted: Mutex, + + pub records: Vec, +} + +impl interface::CompactionLayer for Arc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + + fn file_size(&self) -> u64 { + self.file_size + } + + fn short_id(&self) -> String { + format!( + "{:016X}-{:016X}__{:08X}-{:08X}", + self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0 + ) + } + + fn is_delta(&self) -> bool { + true + } +} + +impl interface::CompactionDeltaLayer for Arc { + type DeltaEntry<'a> = MockRecord; + + async fn 
load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result> { + Ok(self.records.clone()) + } +} + +pub struct MockImageLayer { + pub key_range: Range, + pub lsn_range: Range, + + pub file_size: u64, + + pub deleted: Mutex, +} + +impl interface::CompactionImageLayer for Arc {} + +impl interface::CompactionLayer for Arc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + + fn file_size(&self) -> u64 { + self.file_size + } + + fn short_id(&self) -> String { + format!( + "{:016X}-{:016X}__{:08X}", + self.key_range.start, self.key_range.end, self.lsn_range.start.0, + ) + } + + fn is_delta(&self) -> bool { + false + } +} + +impl MockTimeline { + pub fn new() -> Self { + MockTimeline { + target_file_size: 256 * 1024 * 1024, + tiers_per_level: 4, + + num_l0_flushes: 0, + last_compact_at_flush: 0, + last_flush_lsn: Lsn(0), + + records: Vec::new(), + total_len: 0, + start_lsn: Lsn(1000), + end_lsn: Lsn(1000), + keyspace: KeySpace::new(), + + old_keyspaces: vec![], + + live_layers: vec![], + + num_deleted_layers: 0, + + wal_ingested: 0, + bytes_written: 0, + bytes_deleted: 0, + layers_created: 0, + layers_deleted: 0, + + time: 0, + history: Vec::new(), + } + } + + pub async fn compact(&mut self) -> anyhow::Result<()> { + let ctx = MockRequestContext {}; + + crate::compact_tiered::compact_tiered( + self, + self.last_flush_lsn, + self.target_file_size, + self.tiers_per_level, + &ctx, + ) + .await?; + + Ok(()) + } + + // Ingest one record to the timeline + pub fn ingest_record(&mut self, key: Key, len: u64) { + self.records.push(MockRecord { + lsn: self.end_lsn, + key, + len, + }); + self.total_len += len; + self.end_lsn += len; + + if self.total_len > self.target_file_size { + self.flush_l0(); + } + } + + pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> { + if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level { + self.compact().await?; + self.last_compact_at_flush = 
self.num_l0_flushes; + } + Ok(()) + } + + pub fn flush_l0(&mut self) { + if self.records.is_empty() { + return; + } + + let mut records = std::mem::take(&mut self.records); + records.sort_by_key(|rec| rec.key); + + let lsn_range = self.start_lsn..self.end_lsn; + let new_layer = Arc::new(MockDeltaLayer { + key_range: Key::MIN..Key::MAX, + lsn_range: lsn_range.clone(), + file_size: self.total_len, + records, + deleted: Mutex::new(false), + }); + info!("flushed L0 layer {}", new_layer.short_id()); + self.live_layers.push(MockLayer::from(&new_layer)); + + // reset L0 + self.start_lsn = self.end_lsn; + self.total_len = 0; + self.records = Vec::new(); + + self.layers_created += 1; + self.bytes_written += new_layer.file_size; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::Flush, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + self.num_l0_flushes += 1; + self.last_flush_lsn = self.end_lsn; + } + + // Ingest `num_records' records to the timeline, with random keys + // uniformly distributed in `key_range` + pub fn ingest_uniform( + &mut self, + num_records: u64, + len: u64, + key_range: &Range, + ) -> anyhow::Result<()> { + crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]); + let mut rng = rand::thread_rng(); + for _ in 0..num_records { + self.ingest_record(rng.gen_range(key_range.clone()), len); + self.wal_ingested += len; + } + Ok(()) + } + + pub fn stats(&self) -> anyhow::Result { + let mut s = String::new(); + + writeln!(s, "STATISTICS:")?; + writeln!( + s, + "WAL ingested: {:>10} MB", + self.wal_ingested / (1024 * 1024) + )?; + writeln!( + s, + "size created: {:>10} MB", + self.bytes_written / (1024 * 1024) + )?; + writeln!( + s, + "size deleted: {:>10} MB", + self.bytes_deleted / (1024 * 1024) + )?; + writeln!(s, "files created: {:>10}", self.layers_created)?; + writeln!(s, 
"files deleted: {:>10}", self.layers_deleted)?; + writeln!( + s, + "write amp: {:>10.2}", + self.bytes_written as f64 / self.wal_ingested as f64 + )?; + writeln!( + s, + "storage amp: {:>10.2}", + (self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64 + )?; + + Ok(s) + } + + pub fn draw_history(&self, output: W) -> anyhow::Result<()> { + draw::draw_history(&self.history, output) + } +} + +impl Default for MockTimeline { + fn default() -> Self { + Self::new() + } +} + +#[derive(Clone)] +pub enum MockLayer { + Delta(Arc), + Image(Arc), +} + +impl interface::CompactionLayer for MockLayer { + fn key_range(&self) -> &Range { + match self { + MockLayer::Delta(this) => this.key_range(), + MockLayer::Image(this) => this.key_range(), + } + } + fn lsn_range(&self) -> &Range { + match self { + MockLayer::Delta(this) => this.lsn_range(), + MockLayer::Image(this) => this.lsn_range(), + } + } + fn file_size(&self) -> u64 { + match self { + MockLayer::Delta(this) => this.file_size, + MockLayer::Image(this) => this.file_size, + } + } + fn short_id(&self) -> String { + match self { + MockLayer::Delta(this) => this.short_id(), + MockLayer::Image(this) => this.short_id(), + } + } + + fn is_delta(&self) -> bool { + match self { + MockLayer::Delta(_) => true, + MockLayer::Image(_) => false, + } + } +} + +impl MockLayer { + fn is_deleted(&self) -> bool { + let guard = match self { + MockLayer::Delta(this) => this.deleted.lock().unwrap(), + MockLayer::Image(this) => this.deleted.lock().unwrap(), + }; + *guard + } + fn mark_deleted(&self) { + let mut deleted_guard = match self { + MockLayer::Delta(this) => this.deleted.lock().unwrap(), + MockLayer::Image(this) => this.deleted.lock().unwrap(), + }; + assert!(!*deleted_guard, "layer already deleted"); + *deleted_guard = true; + } +} + +impl From<&Arc> for MockLayer { + fn from(l: &Arc) -> Self { + MockLayer::Delta(l.clone()) + } +} + +impl From<&Arc> for MockLayer { + fn from(l: &Arc) -> Self { + 
MockLayer::Image(l.clone()) + } +} + +impl interface::CompactionJobExecutor for MockTimeline { + type Key = Key; + type Layer = MockLayer; + type DeltaLayer = Arc; + type ImageLayer = Arc; + type RequestContext = MockRequestContext; + + fn get_shard_identity(&self) -> &ShardIdentity { + static IDENTITY: ShardIdentity = ShardIdentity::unsharded(); + &IDENTITY + } + + async fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + _ctx: &Self::RequestContext, + ) -> anyhow::Result> { + // Clear any deleted layers from our vec + self.live_layers.retain(|l| !l.is_deleted()); + + let layers: Vec = self + .live_layers + .iter() + .filter(|l| { + overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range) + }) + .cloned() + .collect(); + + Ok(layers) + } + + async fn get_keyspace( + &mut self, + key_range: &Range, + _lsn: Lsn, + _ctx: &Self::RequestContext, + ) -> anyhow::Result> { + // find it in the levels + if self.old_keyspaces.is_empty() { + Ok(crate::helpers::intersect_keyspace( + &self.keyspace, + key_range, + )) + } else { + // not implemented + + // The mock implementation only allows requesting the + // keyspace at the level's end LSN. That's all that the + // current implementation needs. 
+ panic!("keyspace not available for requested lsn"); + } + } + + async fn downcast_delta_layer( + &self, + layer: &MockLayer, + ) -> anyhow::Result>> { + Ok(match layer { + MockLayer::Delta(l) => Some(l.clone()), + MockLayer::Image(_) => None, + }) + } + + async fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let keyspace = self.get_keyspace(key_range, lsn, ctx).await?; + + let mut accum_size: u64 = 0; + for r in keyspace { + accum_size += r.end - r.start; + } + + let new_layer = Arc::new(MockImageLayer { + key_range: key_range.clone(), + lsn_range: lsn..lsn, + file_size: accum_size * PAGE_SZ, + deleted: Mutex::new(false), + }); + info!( + "created image layer, size {}: {}", + new_layer.file_size, + new_layer.short_id() + ); + self.live_layers.push(MockLayer::Image(new_layer.clone())); + + // update stats + self.bytes_written += new_layer.file_size; + self.layers_created += 1; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::CreateImage, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + Ok(()) + } + + async fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[Arc], + ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let mut key_value_stream = + std::pin::pin!(merge_delta_keys::(input_layers, ctx)); + let mut records: Vec = Vec::new(); + let mut total_len = 2; + while let Some(delta_entry) = key_value_stream.next().await { + let delta_entry: MockRecord = delta_entry?; + if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) { + total_len += delta_entry.len; + records.push(delta_entry); + } + } + let total_records = records.len(); + let new_layer = Arc::new(MockDeltaLayer { + key_range: key_range.clone(), + lsn_range: lsn_range.clone(), + file_size: total_len, + records, 
+ deleted: Mutex::new(false), + }); + info!( + "created delta layer, recs {}, size {}: {}", + total_records, + total_len, + new_layer.short_id() + ); + self.live_layers.push(MockLayer::Delta(new_layer.clone())); + + // update stats + self.bytes_written += total_len; + self.layers_created += 1; + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::CreateDelta, + file: LayerTraceFile { + filename: new_layer.short_id(), + key_range: new_layer.key_range.clone(), + lsn_range: new_layer.lsn_range.clone(), + }, + }); + + Ok(()) + } + + async fn delete_layer( + &mut self, + layer: &Self::Layer, + _ctx: &MockRequestContext, + ) -> anyhow::Result<()> { + let layer = std::pin::pin!(layer); + info!("deleting layer: {}", layer.short_id()); + self.num_deleted_layers += 1; + self.bytes_deleted += layer.file_size(); + layer.mark_deleted(); + + self.time += 1; + self.history.push(LayerTraceEvent { + time_rel: self.time, + op: LayerTraceOp::Delete, + file: LayerTraceFile { + filename: layer.short_id(), + key_range: layer.key_range().clone(), + lsn_range: layer.lsn_range().clone(), + }, + }); + + Ok(()) + } +} diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs new file mode 100644 index 0000000000..997925067f --- /dev/null +++ b/pageserver/compaction/src/simulator/draw.rs @@ -0,0 +1,411 @@ +use super::Key; +use anyhow::Result; +use std::cmp::Ordering; +use std::{ + collections::{BTreeMap, BTreeSet, HashSet}, + fmt::Write, + ops::Range, +}; +use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style}; +use utils::lsn::Lsn; + +// Map values to their compressed coordinate - the index the value +// would have in a sorted and deduplicated list of all values. 
+struct CoordinateMap { + map: BTreeMap, + stretch: f32, +} + +impl CoordinateMap { + fn new(coords: Vec, stretch: f32) -> Self { + let set: BTreeSet = coords.into_iter().collect(); + + let mut map: BTreeMap = BTreeMap::new(); + for (i, e) in set.iter().enumerate() { + map.insert(*e, i); + } + + Self { map, stretch } + } + + // This assumes that the map contains an exact point for this. + // Use map_inexact for values inbetween + fn map(&self, val: T) -> f32 { + *self.map.get(&val).unwrap() as f32 * self.stretch + } + + // the value is still assumed to be within the min/max bounds + // (this is currently unused) + fn _map_inexact(&self, val: T) -> f32 { + let prev = *self.map.range(..=val).next().unwrap().1; + let next = *self.map.range(val..).next().unwrap().1; + + // interpolate + (prev as f32 + (next - prev) as f32) * self.stretch + } + + fn max(&self) -> f32 { + self.map.len() as f32 * self.stretch + } +} + +#[derive(PartialEq, Hash, Eq)] +pub enum LayerTraceOp { + Flush, + CreateDelta, + CreateImage, + Delete, +} + +impl std::fmt::Display for LayerTraceOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + let op_str = match self { + LayerTraceOp::Flush => "flush", + LayerTraceOp::CreateDelta => "create_delta", + LayerTraceOp::CreateImage => "create_image", + LayerTraceOp::Delete => "delete", + }; + f.write_str(op_str) + } +} + +#[derive(PartialEq, Hash, Eq, Clone)] +pub struct LayerTraceFile { + pub filename: String, + pub key_range: Range, + pub lsn_range: Range, +} + +impl LayerTraceFile { + fn is_image(&self) -> bool { + self.lsn_range.end == self.lsn_range.start + } +} + +pub struct LayerTraceEvent { + pub time_rel: u64, + pub op: LayerTraceOp, + pub file: LayerTraceFile, +} + +pub fn draw_history(history: &[LayerTraceEvent], mut output: W) -> Result<()> { + let mut files: Vec = Vec::new(); + + for event in history { + files.push(event.file.clone()); + } + let last_time_rel = history.last().unwrap().time_rel; + + // 
Collect all coordinates + let mut keys: Vec = vec![]; + let mut lsns: Vec = vec![]; + for f in files.iter() { + keys.push(f.key_range.start); + keys.push(f.key_range.end); + lsns.push(f.lsn_range.start); + lsns.push(f.lsn_range.end); + } + + // Analyze + let key_map = CoordinateMap::new(keys, 2.0); + // Stretch out vertically for better visibility + let lsn_map = CoordinateMap::new(lsns, 3.0); + + let mut svg = String::new(); + + // Draw + writeln!( + svg, + "{}", + BeginSvg { + w: key_map.max(), + h: lsn_map.max(), + } + )?; + let lsn_max = lsn_map.max(); + + // Sort the files by LSN, but so that image layers go after all delta layers + // The SVG is painted in the order the elements appear, and we want to draw + // image layers on top of the delta layers if they overlap + // + // (This could also be implemented via z coordinates: image layers get one z + // coord, delta layers get another z coord.) + let mut files_sorted: Vec = files.into_iter().collect(); + files_sorted.sort_by(|a, b| { + if a.is_image() && !b.is_image() { + Ordering::Greater + } else if !a.is_image() && b.is_image() { + Ordering::Less + } else { + a.lsn_range.end.cmp(&b.lsn_range.end) + } + }); + + writeln!(svg, "")?; + let mut files_seen = HashSet::new(); + for f in files_sorted { + if files_seen.contains(&f) { + continue; + } + let key_start = key_map.map(f.key_range.start); + let key_end = key_map.map(f.key_range.end); + let key_diff = key_end - key_start; + + if key_start >= key_end { + panic!("Invalid key range {}-{}", key_start, key_end); + } + + let lsn_start = lsn_map.map(f.lsn_range.start); + let lsn_end = lsn_map.map(f.lsn_range.end); + + // Fill in and thicken rectangle if it's an + // image layer so that we can see it. 
+ let mut style = Style::default(); + style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); + style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5); + + let y_start = lsn_max - lsn_start; + let y_end = lsn_max - lsn_end; + + let x_margin = 0.25; + let y_margin = 0.5; + + match f.lsn_range.start.cmp(&f.lsn_range.end) { + Ordering::Less => { + write!( + svg, + r#" "#, + f.filename, + key_start + x_margin, + y_end + y_margin, + key_diff - x_margin * 2.0, + y_start - y_end - y_margin * 2.0, + 1.0, // border_radius, + style, + )?; + write!(svg, "{}", f.filename)?; + writeln!(svg, "")?; + } + Ordering::Equal => { + //lsn_diff = 0.3; + //lsn_offset = -lsn_diff / 2.0; + //margin = 0.05; + style.fill = Fill::Color(rgb(0x80, 0, 0x80)); + style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0); + write!( + svg, + r#" "#, + f.filename, + key_start + x_margin, + y_end, + key_end - x_margin, + y_end, + style, + )?; + write!( + svg, + "{}<br>{} - {}", + f.filename, lsn_end, y_end + )?; + writeln!(svg, "")?; + } + Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + } + files_seen.insert(f); + } + + let mut record_style = Style::default(); + record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80)); + record_style.stroke = Stroke::None; + + writeln!(svg, "{}", EndSvg)?; + + let mut layer_events_str = String::new(); + let mut first = true; + for e in history { + if !first { + writeln!(layer_events_str, ",")?; + } + write!( + layer_events_str, + r#" {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#, + e.time_rel, e.file.filename, e.op + )?; + first = false; + } + writeln!(layer_events_str)?; + + writeln!( + output, + r#" + + + + + + + + +

+ +
+{svg} +
+ + +"# + )?; + + Ok(()) +} diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs new file mode 100644 index 0000000000..bd8b54a286 --- /dev/null +++ b/pageserver/compaction/tests/tests.rs @@ -0,0 +1,70 @@ +use once_cell::sync::OnceCell; +use pageserver_compaction::interface::CompactionLayer; +use pageserver_compaction::simulator::MockTimeline; +use utils::logging; + +static LOG_HANDLE: OnceCell<()> = OnceCell::new(); + +pub(crate) fn setup_logging() { + LOG_HANDLE.get_or_init(|| { + logging::init( + logging::LogFormat::Test, + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + ) + .expect("Failed to init test logging") + }); +} + +/// Test the extreme case that there are so many updates for a single key that +/// even if we produce an extremely narrow delta layer, spanning just that one +/// key, we still too many records to fit in the target file size. We need to +/// split in the LSN dimension too in that case. +#[tokio::test] +async fn test_many_updates_for_single_key() { + setup_logging(); + let mut executor = MockTimeline::new(); + executor.target_file_size = 1_000_000; // 1 MB + + // Ingest 10 MB of updates to a single key. + for _ in 1..1000 { + executor.ingest_uniform(100, 10, &(0..100_000)).unwrap(); + executor.ingest_uniform(1000, 10, &(0..1)).unwrap(); + executor.compact().await.unwrap(); + } + + // Check that all the layers are smaller than the target size (with some slop) + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } + for l in executor.live_layers.iter() { + assert!(l.file_size() < executor.target_file_size * 2); + // Sanity check that none of the delta layers are empty either. 
+ if l.is_delta() { + assert!(l.file_size() > 0); + } + } +} + +#[tokio::test] +async fn test_simple_updates() { + setup_logging(); + let mut executor = MockTimeline::new(); + executor.target_file_size = 500_000; // 500 KB + + // Ingest some traffic. + for _ in 1..400 { + executor.ingest_uniform(100, 500, &(0..100_000)).unwrap(); + } + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } + + println!("Running compaction..."); + executor.compact().await.unwrap(); + + for l in executor.live_layers.iter() { + println!("layer {}: {}", l.short_id(), l.file_size()); + } +} diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index c5cd451e8d..be5626040b 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -12,9 +12,15 @@ bytes.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } git-version.workspace = true +humantime.workspace = true pageserver = { path = ".." } +pageserver_api.workspace = true +remote_storage = { path = "../../libs/remote_storage" } postgres_ffi.workspace = true +thiserror.workspace = true tokio.workspace = true +tokio-util.workspace = true +toml_edit.workspace = true utils.workspace = true svg_fmt.workspace = true workspace_hack.workspace = true diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 0e77ef0563..389519c65a 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -9,21 +9,49 @@ //! Coordinates in both axis are compressed for better readability. //! (see ) //! -//! Example use: +//! The plain text API was chosen so that we can easily work with filenames from various +//! sources; see the Usage section below for examples. +//! +//! # Usage +//! +//! ## Producing the SVG +//! //! ```bash -//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! 
$ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg -//! $ firefox out.svg +//! +//! # local timeline dir +//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ +//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg +//! +//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer` +//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg +//! +//! # From an `index_part.json` in S3 +//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg +//! +//! # enrich with lines for gc_cutoff and a child branch point +//! cat <(jq -r '.historic_layers[] | .layer_file_name' < layers.json) <(echo -e 'gc_cutoff:0000001CE3FE32C9\nbranch:0000001DE3FE32C9') | cargo run --bin pagectl draw-timeline >| out.svg //! ``` //! -//! This API was chosen so that we can easily work with filenames extracted from ssh, -//! or from pageserver log files. +//! ## Viewing //! -//! TODO Consider shipping this as a grafana panel plugin: -//! -use anyhow::Result; +//! **Inkscape** is better than the built-in viewers in browsers. +//! +//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X) +//! to see the layer file name in the comment field. +//! +//! ```bash +//! +//! # Linux +//! inkscape out.svg +//! +//! # macOS +//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg +//! +//! ``` +//! 
+ +use anyhow::{Context, Result}; use pageserver::repository::Key; -use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; use std::io::{self, BufRead}; use std::path::PathBuf; @@ -54,6 +82,11 @@ fn parse_filename(name: &str) -> (Range, Range) { let split: Vec<&str> = name.split("__").collect(); let keys: Vec<&str> = split[0].split('-').collect(); let mut lsns: Vec<&str> = split[1].split('-').collect(); + + if lsns.last().expect("should").len() == 8 { + lsns.pop(); + } + if lsns.len() == 1 { lsns.push(lsns[0]); } @@ -63,33 +96,94 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } +#[derive(Clone, Copy)] +enum LineKind { + GcCutoff, + Branch, +} + +impl From for Fill { + fn from(value: LineKind) -> Self { + match value { + LineKind::GcCutoff => Fill::Color(rgb(255, 0, 0)), + LineKind::Branch => Fill::Color(rgb(0, 255, 0)), + } + } +} + +impl FromStr for LineKind { + type Err = anyhow::Error; + + fn from_str(s: &str) -> std::prelude::v1::Result { + Ok(match s { + "gc_cutoff" => LineKind::GcCutoff, + "branch" => LineKind::Branch, + _ => anyhow::bail!("unsupported linekind: {s}"), + }) + } +} + pub fn main() -> Result<()> { // Parse layer filenames from stdin - let mut ranges: Vec<(Range, Range)> = vec![]; + struct Layer { + filename: String, + key_range: Range, + lsn_range: Range, + } + let mut files: Vec = vec![]; let stdin = io::stdin(); - for line in stdin.lock().lines() { + + let mut lines: Vec<(Lsn, LineKind)> = vec![]; + + for (lineno, line) in stdin.lock().lines().enumerate() { + let lineno = lineno + 1; + let line = line.unwrap(); + if let Some((kind, lsn)) = line.split_once(':') { + let (kind, lsn) = LineKind::from_str(kind) + .context("parse kind") + .and_then(|kind| { + if lsn.contains('/') { + Lsn::from_str(lsn) + } else { + Lsn::from_hex(lsn) + } + .map(|lsn| (kind, lsn)) + .context("parse lsn") + }) + .with_context(|| format!("parse {line:?} on {lineno}"))?; + lines.push((lsn, kind)); + continue; + } let line = 
PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); - if filename == METADATA_FILE_NAME { - // Don't try and parse "metadata" like a key-lsn range - continue; - } - let range = parse_filename(filename); - ranges.push(range); + let (key_range, lsn_range) = parse_filename(filename); + files.push(Layer { + filename: filename.to_owned(), + key_range, + lsn_range, + }); } // Collect all coordinates - let mut keys: Vec = vec![]; - let mut lsns: Vec = vec![]; - for (keyr, lsnr) in &ranges { + let mut keys: Vec = Vec::with_capacity(files.len()); + let mut lsns: Vec = Vec::with_capacity(files.len() + lines.len()); + + for Layer { + key_range: keyr, + lsn_range: lsnr, + .. + } in &files + { keys.push(keyr.start); keys.push(keyr.end); lsns.push(lsnr.start); lsns.push(lsnr.end); } + lsns.extend(lines.iter().map(|(lsn, _)| *lsn)); + // Analyze let key_map = build_coordinate_compression_map(keys); let lsn_map = build_coordinate_compression_map(lsns); @@ -103,11 +197,19 @@ pub fn main() -> Result<()> { println!( "{}", BeginSvg { - w: key_map.len() as f32, + w: (key_map.len() + 10) as f32, h: stretch * lsn_map.len() as f32 } ); - for (keyr, lsnr) in &ranges { + + let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas + + for Layer { + filename, + key_range: keyr, + lsn_range: lsnr, + } in &files + { let key_start = *key_map.get(&keyr.start).unwrap(); let key_end = *key_map.get(&keyr.end).unwrap(); let key_diff = key_end - key_start; @@ -123,7 +225,6 @@ pub fn main() -> Result<()> { let mut lsn_diff = (lsn_end - lsn_start) as f32; let mut fill = Fill::None; let mut ymargin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas - let xmargin = 0.05; // Height-dependent margin to disambiguate overlapping deltas let mut lsn_offset = 0.0; // Fill in and thicken rectangle if it's an @@ -143,7 +244,7 @@ pub fn main() -> Result<()> { println!( " {}", rectangle( - 
key_start as f32 + stretch * xmargin, + 5.0 + key_start as f32 + stretch * xmargin, stretch * (lsn_max as f32 - (lsn_end as f32 - ymargin - lsn_offset)), key_diff as f32 - stretch * 2.0 * xmargin, stretch * (lsn_diff - 2.0 * ymargin) @@ -151,8 +252,29 @@ pub fn main() -> Result<()> { .fill(fill) .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) .border_radius(0.4) + .comment(filename) ); } + + for (lsn, kind) in lines { + let lsn_start = *lsn_map.get(&lsn).unwrap(); + let lsn_end = lsn_start; + let stretch = 2.0; + let lsn_diff = 0.3; + let lsn_offset = -lsn_diff / 2.0; + let ymargin = 0.05; + println!( + "{}", + rectangle( + 0.0f32 + stretch * xmargin, + stretch * (lsn_map.len() as f32 - (lsn_end as f32 - ymargin - lsn_offset)), + (key_map.len() + 10) as f32, + stretch * (lsn_diff - 2.0 * ymargin) + ) + .fill(kind) + ); + } + println!("{}", EndSvg); eprintln!("num_images: {}", num_images); diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 20e5572914..20018846f8 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -1,11 +1,6 @@ -use std::collections::HashMap; - use anyhow::Context; use camino::Utf8PathBuf; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; -use pageserver::tenant::storage_layer::LayerFileName; -use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; -use utils::lsn::Lsn; +use pageserver::tenant::IndexPart; #[derive(clap::Subcommand)] pub(crate) enum IndexPartCmd { @@ -17,20 +12,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { IndexPartCmd::Dump { path } => { let bytes = tokio::fs::read(path).await.context("read file")?; let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; - #[derive(serde::Serialize)] - struct Output<'a> { - layer_metadata: &'a HashMap, - disk_consistent_lsn: Lsn, - timeline_metadata: &'a TimelineMetadata, - } - - let output = Output { - layer_metadata: &des.layer_metadata, - 
disk_consistent_lsn: des.get_disk_consistent_lsn(), - timeline_metadata: &des.metadata, - }; - - let output = serde_json::to_string_pretty(&output).context("serialize output")?; + let output = serde_json::to_string_pretty(&des).context("serialize output")?; println!("{output}"); Ok(()) } diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs new file mode 100644 index 0000000000..af4b5a21ab --- /dev/null +++ b/pageserver/ctl/src/key.rs @@ -0,0 +1,475 @@ +use anyhow::Context; +use clap::Parser; +use pageserver_api::{ + key::Key, + reltag::{BlockNumber, RelTag, SlruKind}, + shard::{ShardCount, ShardStripeSize}, +}; +use std::str::FromStr; + +#[derive(Parser)] +pub(super) struct DescribeKeyCommand { + /// Key material in one of the forms: hex, span attributes captured from log, reltag blocknum + input: Vec, + + /// The number of shards to calculate what Keys placement would be. + #[arg(long)] + shard_count: Option, + + /// The sharding stripe size. + /// + /// The default is hardcoded. It makes no sense to provide this without providing + /// `--shard-count`. + #[arg(long, requires = "shard_count")] + stripe_size: Option, +} + +/// Sharded shard count without unsharded count, which the actual ShardCount supports. 
+#[derive(Clone, Copy)] +pub(super) struct CustomShardCount(std::num::NonZeroU8); + +#[derive(Debug, thiserror::Error)] +pub(super) enum InvalidShardCount { + #[error(transparent)] + ParsingFailed(#[from] std::num::ParseIntError), + #[error("too few shards")] + TooFewShards, +} + +impl FromStr for CustomShardCount { + type Err = InvalidShardCount; + + fn from_str(s: &str) -> Result { + let inner: std::num::NonZeroU8 = s.parse()?; + if inner.get() < 2 { + Err(InvalidShardCount::TooFewShards) + } else { + Ok(CustomShardCount(inner)) + } + } +} + +impl From for ShardCount { + fn from(value: CustomShardCount) -> Self { + ShardCount::new(value.0.get()) + } +} + +impl DescribeKeyCommand { + pub(super) fn execute(self) { + let DescribeKeyCommand { + input, + shard_count, + stripe_size, + } = self; + + let material = KeyMaterial::try_from(input.as_slice()).unwrap(); + let kind = material.kind(); + let key = Key::from(material); + + println!("parsed from {kind}: {key}:"); + println!(); + println!("{key:?}"); + + macro_rules! kind_query { + ([$($name:ident),*$(,)?]) => {{[$(kind_query!($name)),*]}}; + ($name:ident) => {{ + let s: &'static str = stringify!($name); + let s = s.strip_prefix("is_").unwrap_or(s); + let s = s.strip_suffix("_key").unwrap_or(s); + + #[allow(clippy::needless_borrow)] + (s, key.$name()) + }}; + } + + // the current characterization is a mess of these boolean queries and separate + // "recognization". I think it accurately represents how strictly we model the Key + // right now, but could of course be made less confusing. 
+ + let queries = kind_query!([ + is_rel_block_key, + is_rel_vm_block_key, + is_rel_fsm_block_key, + is_slru_block_key, + is_inherited_key, + is_rel_size_key, + is_slru_segment_size_key, + ]); + + let recognized_kind = "recognized kind"; + let metadata_key = "metadata key"; + let shard_placement = "shard placement"; + + let longest = queries + .iter() + .map(|t| t.0) + .chain([recognized_kind, metadata_key, shard_placement]) + .map(|s| s.len()) + .max() + .unwrap(); + + let colon = 1; + let padding = 1; + + for (name, is) in queries { + let width = longest - name.len() + colon + padding; + println!("{}{:width$}{}", name, ":", is); + } + + let width = longest - recognized_kind.len() + colon + padding; + println!( + "{}{:width$}{:?}", + recognized_kind, + ":", + RecognizedKeyKind::new(key), + ); + + if let Some(shard_count) = shard_count { + // seeing the sharding placement might be confusing, so leave it out unless shard + // count was given. + + let stripe_size = stripe_size.map(ShardStripeSize).unwrap_or_default(); + println!( + "# placement with shard_count: {} and stripe_size: {}:", + shard_count.0, stripe_size.0 + ); + let width = longest - shard_placement.len() + colon + padding; + println!( + "{}{:width$}{:?}", + shard_placement, + ":", + pageserver_api::shard::describe(&key, shard_count.into(), stripe_size) + ); + } + } +} + +/// Hand-wavy "inputs we accept" for a key. 
+#[derive(Debug)] +pub(super) enum KeyMaterial { + Hex(Key), + String(SpanAttributesFromLogs), + Split(RelTag, BlockNumber), +} + +impl KeyMaterial { + fn kind(&self) -> &'static str { + match self { + KeyMaterial::Hex(_) => "hex", + KeyMaterial::String(_) | KeyMaterial::Split(_, _) => "split", + } + } +} + +impl From for Key { + fn from(value: KeyMaterial) -> Self { + match value { + KeyMaterial::Hex(key) => key, + KeyMaterial::String(SpanAttributesFromLogs(rt, blocknum)) + | KeyMaterial::Split(rt, blocknum) => { + pageserver_api::key::rel_block_to_key(rt, blocknum) + } + } + } +} + +impl> TryFrom<&[S]> for KeyMaterial { + type Error = anyhow::Error; + + fn try_from(value: &[S]) -> Result { + match value { + [] => anyhow::bail!( + "need 1..N positional arguments describing the key, try hex or a log line" + ), + [one] => { + let one = one.as_ref(); + + let key = Key::from_hex(one).map(KeyMaterial::Hex); + + let attrs = SpanAttributesFromLogs::from_str(one).map(KeyMaterial::String); + + match (key, attrs) { + (Ok(key), _) => Ok(key), + (_, Ok(s)) => Ok(s), + (Err(e1), Err(e2)) => anyhow::bail!( + "failed to parse {one:?} as hex or span attributes:\n- {e1:#}\n- {e2:#}" + ), + } + } + more => { + // assume going left to right one of these is a reltag and then we find a blocknum + // this works, because we don't have plain numbers at least right after reltag in + // logs. for some definition of "works". 
+ + let Some((reltag_at, reltag)) = more + .iter() + .map(AsRef::as_ref) + .enumerate() + .find_map(|(i, s)| { + s.split_once("rel=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + .map(|rt| (i, rt)) + }) + else { + anyhow::bail!("found no RelTag in arguments"); + }; + + let Some(blocknum) = more + .iter() + .map(AsRef::as_ref) + .skip(reltag_at) + .find_map(|s| { + s.split_once("blkno=") + .map(|(_garbage, actual)| actual) + .unwrap_or(s) + .parse::() + .ok() + }) + else { + anyhow::bail!("found no blocknum in arguments"); + }; + + Ok(KeyMaterial::Split(reltag, blocknum)) + } + } + } +} + +#[derive(Debug)] +pub(super) struct SpanAttributesFromLogs(RelTag, BlockNumber); + +impl std::str::FromStr for SpanAttributesFromLogs { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + // accept the span separator but do not require or fail if either is missing + // "whatever{rel=1663/16389/24615 blkno=1052204 req_lsn=FFFFFFFF/FFFFFFFF}" + let (_, reltag) = s + .split_once("rel=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'rel='"))?; + let reltag = reltag.split_whitespace().next().unwrap(); + + let (_, blocknum) = s + .split_once("blkno=") + .ok_or_else(|| anyhow::anyhow!("cannot find 'blkno='"))?; + let blocknum = blocknum.split_whitespace().next().unwrap(); + + let reltag = reltag + .parse() + .with_context(|| format!("parse reltag from {reltag:?}"))?; + let blocknum = blocknum + .parse() + .with_context(|| format!("parse blocknum from {blocknum:?}"))?; + + Ok(Self(reltag, blocknum)) + } +} + +#[derive(Debug)] +#[allow(dead_code)] // debug print is used +enum RecognizedKeyKind { + DbDir, + ControlFile, + Checkpoint, + AuxFilesV1, + SlruDir(Result), + RelMap(RelTagish<2>), + RelDir(RelTagish<2>), + AuxFileV2(Result>), +} + +#[derive(Debug, PartialEq)] +#[allow(unused)] +enum AuxFileV2 { + Recognized(&'static str, utils::Hex<[u8; 13]>), + OtherWithPrefix(&'static str, utils::Hex<[u8; 13]>), + Other(utils::Hex<[u8; 13]>), +} + 
+impl RecognizedKeyKind { + fn new(key: Key) -> Option { + use RecognizedKeyKind::{ + AuxFilesV1, Checkpoint, ControlFile, DbDir, RelDir, RelMap, SlruDir, + }; + + let slru_dir_kind = pageserver_api::key::slru_dir_kind(&key); + + Some(match key { + pageserver_api::key::DBDIR_KEY => DbDir, + pageserver_api::key::CONTROLFILE_KEY => ControlFile, + pageserver_api::key::CHECKPOINT_KEY => Checkpoint, + pageserver_api::key::AUX_FILES_KEY => AuxFilesV1, + _ if slru_dir_kind.is_some() => SlruDir(slru_dir_kind.unwrap()), + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 0 => { + RelMap([key.field2, key.field3].into()) + } + _ if key.field1 == 0 && key.field4 == 0 && key.field5 == 0 && key.field6 == 1 => { + RelDir([key.field2, key.field3].into()) + } + _ if key.is_metadata_key() => RecognizedKeyKind::AuxFileV2( + AuxFileV2::new(key).ok_or_else(|| utils::Hex(key.to_i128().to_be_bytes())), + ), + _ => return None, + }) + } +} + +impl AuxFileV2 { + fn new(key: Key) -> Option { + const EMPTY_HASH: [u8; 13] = { + let mut out = [0u8; 13]; + let hash = pageserver::aux_file::fnv_hash(b"").to_be_bytes(); + let mut i = 3; + while i < 16 { + out[i - 3] = hash[i]; + i += 1; + } + out + }; + + let bytes = key.to_i128().to_be_bytes(); + let hash = utils::Hex(<[u8; 13]>::try_from(&bytes[3..]).unwrap()); + + assert_eq!(EMPTY_HASH.len(), hash.0.len()); + + // TODO: we could probably find the preimages for the hashes + + Some(match (bytes[1], bytes[2]) { + (1, 1) => AuxFileV2::Recognized("pg_logical/mappings/", hash), + (1, 2) => AuxFileV2::Recognized("pg_logical/snapshots/", hash), + (1, 3) if hash.0 == EMPTY_HASH => { + AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash) + } + (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash), + (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash), + (0xff, 0xff) => AuxFileV2::Other(hash), + _ => return None, + }) + } +} + +/// Prefix of RelTag, currently only known use cases are the two item versions. 
+/// +/// Renders like a reltag with `/`, nothing else. +struct RelTagish([u32; N]); + +impl From<[u32; N]> for RelTagish { + fn from(val: [u32; N]) -> Self { + RelTagish(val) + } +} + +impl std::fmt::Debug for RelTagish { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use std::fmt::Write as _; + let mut first = true; + self.0.iter().try_for_each(|x| { + if !first { + f.write_char('/')?; + } + first = false; + write!(f, "{}", x) + }) + } +} + +#[cfg(test)] +mod tests { + use pageserver::aux_file::encode_aux_file_key; + + use super::*; + + #[test] + fn hex_is_key_material() { + let m = KeyMaterial::try_from(&["000000067F0000400200DF927900FFFFFFFF"][..]).unwrap(); + assert!(matches!(m, KeyMaterial::Hex(_)), "{m:?}"); + } + + #[test] + fn single_positional_spanalike_is_key_material() { + // why is this needed? if you are checking many, then copypaste starts to appeal + let strings = [ + (line!(), "2024-05-15T15:33:49.873906Z ERROR page_service_conn_main{peer_addr=A:B}:process_query{tenant_id=C timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm blkno=2 req_lsn=0/238D98C8}: error reading relation or page version: Read error: could not find data for key 000000067F00032CE5000000000000000001 (shard ShardNumber(0)) at LSN 0/1D0A16C1, request LSN 0/238D98C8, ancestor 0/0"), + (line!(), "rel=1663/208101/2620_fsm blkno=2"), + (line!(), "rel=1663/208101/2620.1 blkno=2"), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(&[example][..]) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + + // not supporting this is rather accidential, but I think the input parsing is lenient + // enough already + KeyMaterial::try_from(&["1663/208101/2620_fsm 2"][..]).unwrap_err(); + } + + #[test] + fn 
multiple_spanlike_args() { + let strings = [ + (line!(), &["process_query{tenant_id=C", "timeline_id=D}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1663/208101/2620_fsm", "blkno=2", "req_lsn=0/238D98C8}"][..]), + (line!(), &["rel=1663/208101/2620_fsm", "blkno=2"][..]), + (line!(), &["1663/208101/2620_fsm", "2"][..]), + ]; + + let mut first: Option = None; + + for (line, example) in strings { + let m = KeyMaterial::try_from(example) + .unwrap_or_else(|e| panic!("failed to parse example from line {line}: {e:?}")); + let key = Key::from(m); + if let Some(first) = first { + assert_eq!(first, key); + } else { + first = Some(key); + } + } + } + #[test] + fn recognized_auxfiles() { + use AuxFileV2::*; + + let empty = [ + 0x2e, 0x07, 0xbb, 0x01, 0x42, 0x62, 0xb8, 0x21, 0x75, 0x62, 0x95, 0xc5, 0x8d, + ]; + let foobar = [ + 0x62, 0x79, 0x3c, 0x64, 0xbf, 0x6f, 0x0d, 0x35, 0x97, 0xba, 0x44, 0x6f, 0x18, + ]; + + #[rustfmt::skip] + let examples = [ + (line!(), "pg_logical/mappings/foobar", Recognized("pg_logical/mappings/", utils::Hex(foobar))), + (line!(), "pg_logical/snapshots/foobar", Recognized("pg_logical/snapshots/", utils::Hex(foobar))), + (line!(), "pg_logical/replorigin_checkpoint", Recognized("pg_logical/replorigin_checkpoint", utils::Hex(empty))), + (line!(), "pg_logical/foobar", OtherWithPrefix("pg_logical/", utils::Hex(foobar))), + (line!(), "pg_replslot/foobar", Recognized("pg_replslot/", utils::Hex(foobar))), + (line!(), "foobar", Other(utils::Hex(foobar))), + ]; + + for (line, path, expected) in examples { + let key = encode_aux_file_key(path); + let recognized = + AuxFileV2::new(key).unwrap_or_else(|| panic!("line {line} example failed")); + + assert_eq!(recognized, expected); + } + + assert_eq!( + AuxFileV2::new(Key::from_hex("600000102000000000000000000000000000").unwrap()), + None, + "example key has one too few 0 after 6 before 1" + ); + } +} diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 
15d4eb09e0..b4bb239f44 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -12,13 +12,13 @@ use std::collections::BinaryHeap; use std::ops::Range; use std::{fs, str}; -use pageserver::page_cache::PAGE_SZ; +use pageserver::page_cache::{self, PAGE_SZ}; use pageserver::repository::{Key, KEY_SIZE}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; use pageserver::tenant::storage_layer::range_overlaps; -use pageserver::virtual_file::VirtualFile; +use pageserver::virtual_file::{self, VirtualFile}; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option { // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result> { - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - file, + block_reader, ); // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); @@ -142,7 +144,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
- pageserver::virtual_file::init(10); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index ebf4a4bec3..3611b0baab 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,15 +59,17 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - &file, + &block_reader, ); // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. 
let mut all = vec![]; @@ -83,7 +85,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result ctx, ) .await?; - let cursor = BlockCursor::new_fileblockreader(&file); + let cursor = BlockCursor::new_fileblockreader(&block_reader); for (k, v) in all { let value = cursor.read_blob(v.pos(), ctx).await?; println!("key:{} value_len:{}", k, value.len()); @@ -187,7 +189,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index fb42d6d2f1..50c3ac4c61 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -6,9 +6,15 @@ mod draw_timeline_dir; mod index_part; +mod key; mod layer_map_analyzer; mod layers; +use std::{ + str::FromStr, + time::{Duration, SystemTime}, +}; + use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; use index_part::IndexPartCmd; @@ -20,8 +26,16 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; +use remote_storage::{RemotePath, RemoteStorageConfig}; +use tokio_util::sync::CancellationToken; +use utils::{ + id::TimelineId, + logging::{self, LogFormat, TracingErrorLayerEnablement}, + lsn::Lsn, + project_git_version, +}; project_git_version!(GIT_VERSION); @@ -43,10 +57,13 @@ enum Commands { #[command(subcommand)] IndexPart(IndexPartCmd), PrintLayerFile(PrintLayerFileCmd), + TimeTravelRemotePrefix(TimeTravelRemotePrefixCmd), DrawTimeline {}, AnalyzeLayerMap(AnalyzeLayerMapCmd), #[command(subcommand)] Layer(LayerCmd), + /// Debug print a hex key found from logs + Key(key::DescribeKeyCommand), 
} /// Read and update pageserver metadata file @@ -68,6 +85,26 @@ struct PrintLayerFileCmd { path: Utf8PathBuf, } +/// Roll back the time for the specified prefix using S3 history. +/// +/// The command is fairly low level and powerful. Validation is only very light, +/// so it is more powerful, and thus potentially more dangerous. +#[derive(Parser)] +struct TimeTravelRemotePrefixCmd { + /// A configuration string for the remote_storage configuration. + /// + /// Example: `remote_storage = { bucket_name = "aws-storage-bucket-name", bucket_region = "us-east-2" }` + config_toml_str: String, + /// remote prefix to time travel recover. For safety reasons, we require it to contain + /// a timeline or tenant ID in the prefix. + prefix: String, + /// Timestamp to travel to. Given in format like `2024-01-20T10:45:45Z`. Assumes UTC and second accuracy. + travel_to: String, + /// Timestamp of the start of the operation, must be after any changes we want to roll back and after. + /// You can use a few seconds before invoking the command. Same format as `travel_to`. 
+ done_if_after: Option, +} + #[derive(Parser)] struct AnalyzeLayerMapCmd { /// Pageserver data path @@ -78,6 +115,14 @@ struct AnalyzeLayerMapCmd { #[tokio::main] async fn main() -> anyhow::Result<()> { + logging::init( + LogFormat::Plain, + TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let cli = CliOpts::parse(); match cli.command { @@ -105,6 +150,43 @@ async fn main() -> anyhow::Result<()> { print_layerfile(&cmd.path).await?; } } + Commands::TimeTravelRemotePrefix(cmd) => { + let timestamp = humantime::parse_rfc3339(&cmd.travel_to) + .map_err(|_e| anyhow::anyhow!("Invalid time for travel_to: '{}'", cmd.travel_to))?; + + let done_if_after = if let Some(done_if_after) = &cmd.done_if_after { + humantime::parse_rfc3339(done_if_after).map_err(|_e| { + anyhow::anyhow!("Invalid time for done_if_after: '{}'", done_if_after) + })? + } else { + const SAFETY_MARGIN: Duration = Duration::from_secs(3); + tokio::time::sleep(SAFETY_MARGIN).await; + // Convert to string representation and back to get rid of sub-second values + let done_if_after = SystemTime::now(); + tokio::time::sleep(SAFETY_MARGIN).await; + done_if_after + }; + + let timestamp = strip_subsecond(timestamp); + let done_if_after = strip_subsecond(done_if_after); + + let Some(prefix) = validate_prefix(&cmd.prefix) else { + println!("specified prefix '{}' failed validation", cmd.prefix); + return Ok(()); + }; + let toml_document = toml_edit::Document::from_str(&cmd.config_toml_str)?; + let toml_item = toml_document + .get("remote_storage") + .expect("need remote_storage"); + let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let storage = remote_storage::GenericRemoteStorage::from_config(&config); + let cancel = CancellationToken::new(); + storage + .unwrap() + .time_travel_recover(Some(&prefix), timestamp, done_if_after, &cancel) + .await?; + } + Commands::Key(dkc) => 
dkc.execute(), }; Ok(()) } @@ -123,7 +205,7 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await @@ -141,6 +223,7 @@ fn handle_metadata( let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; println!("Current metadata:\n{meta:?}"); let mut update_meta = false; + // TODO: simplify this part if let Some(disk_consistent_lsn) = disk_consistent_lsn { meta = TimelineMetadata::new( *disk_consistent_lsn, @@ -185,3 +268,89 @@ fn handle_metadata( Ok(()) } + +/// Ensures that the given S3 prefix is sufficiently constrained. +/// The command is very risky already and we don't want to expose something +/// that allows usually unintentional and quite catastrophic time travel of +/// an entire bucket, which would be a major catastrophy and away +/// by only one character change (similar to "rm -r /home /username/foobar"). 
+fn validate_prefix(prefix: &str) -> Option { + if prefix.is_empty() { + // Empty prefix means we want to specify the *whole* bucket + return None; + } + let components = prefix.split('/').collect::>(); + let (last, components) = { + let last = components.last()?; + if last.is_empty() { + ( + components.iter().nth_back(1)?, + &components[..(components.len() - 1)], + ) + } else { + (last, &components[..]) + } + }; + 'valid: { + if let Ok(_timeline_id) = TimelineId::from_str(last) { + // Ends in either a tenant or timeline ID + break 'valid; + } + if *last == "timelines" { + if let Some(before_last) = components.iter().nth_back(1) { + if let Ok(_tenant_id) = TenantShardId::from_str(before_last) { + // Has a valid tenant id + break 'valid; + } + } + } + + return None; + } + RemotePath::from_string(prefix).ok() +} + +fn strip_subsecond(timestamp: SystemTime) -> SystemTime { + let ts_str = humantime::format_rfc3339_seconds(timestamp).to_string(); + humantime::parse_rfc3339(&ts_str).expect("can't parse just created timestamp") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_prefix() { + assert_eq!(validate_prefix(""), None); + assert_eq!(validate_prefix("/"), None); + #[track_caller] + fn assert_valid(prefix: &str) { + let remote_path = RemotePath::from_string(prefix).unwrap(); + assert_eq!(validate_prefix(prefix), Some(remote_path)); + } + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/"); + // Path is not relative but absolute + assert_eq!( + validate_prefix( + "/wal/3aa8fcc61f6d357410b7de754b1d9001/641e5342083b2235ee3deb8066819683/" + ), + None + ); + assert_valid("wal/3aa8fcc61f6d357410b7de754b1d9001/"); + // Partial tenant IDs should be invalid, S3 will match all tenants with the specific ID prefix + assert_eq!(validate_prefix("wal/3aa8fcc61f6d357410b7d"), None); + assert_eq!(validate_prefix("wal"), None); + assert_eq!(validate_prefix("/wal/"), None); + 
assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001"); + // Partial tenant ID + assert_eq!( + validate_prefix("pageserver/v1/tenants/3aa8fcc61f6d357410b"), + None + ); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001-0004/timelines"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/"); + assert_valid("pageserver/v1/tenants/3aa8fcc61f6d357410b7de754b1d9001/timelines/641e5342083b2235ee3deb8066819683"); + assert_eq!(validate_prefix("pageserver/v1/tenants/"), None); + } +} diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml index 169d9b7f8e..245d293e4f 100644 --- a/pageserver/pagebench/Cargo.toml +++ b/pageserver/pagebench/Cargo.toml @@ -8,6 +8,7 @@ license.workspace = true [dependencies] anyhow.workspace = true +camino.workspace = true clap.workspace = true futures.workspace = true hdrhistogram.workspace = true @@ -18,8 +19,8 @@ serde.workspace = true serde_json.workspace = true tracing.workspace = true tokio.workspace = true +tokio-util.workspace = true -pageserver = { path = ".." } pageserver_client.workspace = true pageserver_api.workspace = true utils = { path = "../../libs/utils/" } diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs new file mode 100644 index 0000000000..bce3285606 --- /dev/null +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -0,0 +1,105 @@ +use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest}; +use pageserver_api::shard::TenantShardId; +use utils::id::TenantTimelineId; +use utils::lsn::Lsn; + +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Instant; + +/// Ingest aux files into the pageserver. 
+#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, + #[clap(long)] + pageserver_jwt: Option, + + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let main_task = rt.spawn(main_impl(args)); + rt.block_on(main_task).unwrap() +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: None, + targets: { + if let Some(targets) = &args.targets { + if targets.len() != 1 { + anyhow::bail!("must specify exactly one target"); + } + Some(targets.clone()) + } else { + None + } + }, + }, + ) + .await?; + + let timeline = timelines[0]; + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + let timeline_id = timeline.timeline_id; + + println!("operating on timeline {}", timeline); + + mgmt_api_client + .tenant_config(&TenantConfigRequest { + tenant_id: timeline.tenant_id, + config: TenantConfig { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + }) + .await?; + + for batch in 0..100 { + let items = (0..100) + .map(|id| { + ( + format!("pg_logical/mappings/{:03}.{:03}", batch, id), + format!("{:08}", id), + ) + }) + .collect::>(); + let file_cnt = items.len(); + mgmt_api_client + .ingest_aux_files(tenant_shard_id, timeline_id, items) + .await?; + println!("ingested {file_cnt} files"); + } + + for _ in 0..100 { + let start = Instant::now(); + let 
files = mgmt_api_client + .list_aux_files(tenant_shard_id, timeline_id, Lsn(Lsn::MAX.0 - 1)) + .await?; + println!( + "{} files found in {}s", + files.len(), + start.elapsed().as_secs_f64() + ); + } + + anyhow::Ok(()) +} diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 85a3e695de..3ae6d99aa7 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,4 +1,6 @@ use anyhow::Context; +use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; use utils::id::TenantTimelineId; @@ -7,7 +9,7 @@ use utils::lsn::Lsn; use rand::prelude::*; use tokio::sync::Barrier; use tokio::task::JoinSet; -use tracing::{debug, info, instrument}; +use tracing::{info, instrument}; use std::collections::HashMap; use std::num::NonZeroUsize; @@ -24,8 +26,8 @@ use crate::util::{request_stats, tokio_thread_local_stats}; pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, - #[clap(long, default_value = "localhost:64000")] - page_service_host_port: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, #[clap(long)] pageserver_jwt: Option, #[clap(long, default_value = "1")] @@ -92,10 +94,12 @@ async fn main_impl( for timeline in &timelines { js.spawn({ let timeline = *timeline; - // FIXME: this triggers initial logical size calculation - // https://github.com/neondatabase/neon/issues/6168 let info = mgmt_api_client - .timeline_info(timeline.tenant_id, timeline.timeline_id) + .timeline_info( + TenantShardId::unsharded(timeline.tenant_id), + timeline.timeline_id, + ForceAwaitLogicalSize::No, + ) .await .unwrap(); async move { @@ -227,12 +231,9 @@ async fn client( ) { start_work_barrier.wait().await; - let client = 
pageserver_client::page_service::Client::new(crate::util::connstring::connstring( - &args.page_service_host_port, - args.pageserver_jwt.as_deref(), - )) - .await - .unwrap(); + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); while let Some(Work { lsn, gzip }) = work.recv().await { let start = Instant::now(); @@ -260,7 +261,7 @@ async fn client( } }) .await; - debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + info!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); let elapsed = start.elapsed(); live_stats.inc(); STATS.with(|stats| { diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index cb36a403f1..4992f37465 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -1,19 +1,19 @@ use anyhow::Context; -use futures::future::join_all; -use pageserver::pgdatadir_mapping::key_to_rel_block; -use pageserver::repository; -use pageserver_api::key::is_rel_block_key; +use camino::Utf8PathBuf; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; +use pageserver_api::shard::TenantShardId; +use tokio_util::sync::CancellationToken; use utils::id::TenantTimelineId; use utils::lsn::Lsn; use rand::prelude::*; -use tokio::sync::Barrier; use tokio::task::JoinSet; -use tracing::{info, instrument}; +use tracing::info; -use std::collections::HashMap; +use std::collections::HashSet; use std::future::Future; use std::num::NonZeroUsize; use std::pin::Pin; @@ -37,28 +37,46 @@ pub(crate) struct Args { num_clients: NonZeroUsize, #[clap(long)] runtime: Option, + /// Each client sends requests at the given rate. + /// + /// If a request takes too long and we should be issuing a new request already, + /// we skip that request and account it as `MISSED`. 
#[clap(long)] - per_target_rate_limit: Option, + per_client_rate: Option, /// Probability for sending `latest=true` in the request (uniform distribution). #[clap(long, default_value = "1")] req_latest_probability: f64, #[clap(long)] limit_to_first_n_targets: Option, + /// For large pageserver installations, enumerating the keyspace takes a lot of time. + /// If specified, the specified path is used to maintain a cache of the keyspace enumeration result. + /// The cache is tagged and auto-invalided by the tenant/timeline ids only. + /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction. + #[clap(long)] + keyspace_cache: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use the given + /// [`pageserver_api::models::virtual_file::IoEngineKind`]. + #[clap(long)] + set_io_engine: Option, targets: Option>, } #[derive(Debug, Default)] struct LiveStats { completed_requests: AtomicU64, + missed: AtomicU64, } impl LiveStats { - fn inc(&self) { + fn request_done(&self) { self.completed_requests.fetch_add(1, Ordering::Relaxed); } + fn missed(&self, n: u64) { + self.missed.fetch_add(n, Ordering::Relaxed); + } } -#[derive(Clone)] +#[derive(Clone, serde::Serialize, serde::Deserialize)] struct KeyRange { timeline: TenantTimelineId, timeline_lsn: Lsn, @@ -72,6 +90,12 @@ impl KeyRange { } } +#[derive(PartialEq, Eq, Hash, Copy, Clone)] +struct WorkerId { + timeline: TenantTimelineId, + num_client: usize, // from 0..args.num_clients +} + #[derive(serde::Serialize)] struct Output { total: request_stats::Output, @@ -96,6 +120,10 @@ async fn main_impl( args.pageserver_jwt.as_deref(), )); + if let Some(engine_str) = &args.set_io_engine { + mgmt_api_client.put_io_engine(engine_str).await?; + } + // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, @@ -106,59 +134,109 @@ async fn main_impl( ) .await?; - let mut js = JoinSet::new(); - for timeline in 
&timelines { - js.spawn({ - let mgmt_api_client = Arc::clone(&mgmt_api_client); - let timeline = *timeline; - async move { - let partitioning = mgmt_api_client - .keyspace(timeline.tenant_id, timeline.timeline_id) - .await?; - let lsn = partitioning.at_lsn; - - let ranges = partitioning - .keys - .ranges - .iter() - .filter_map(|r| { - let start = r.start; - let end = r.end; - // filter out non-relblock keys - match (is_rel_block_key(&start), is_rel_block_key(&end)) { - (true, true) => Some(KeyRange { - timeline, - timeline_lsn: lsn, - start: start.to_i128(), - end: end.to_i128(), - }), - (true, false) | (false, true) => { - unimplemented!("split up range") + #[derive(serde::Deserialize)] + struct KeyspaceCacheDe { + tag: Vec, + data: Vec, + } + #[derive(serde::Serialize)] + struct KeyspaceCacheSer<'a> { + tag: &'a [TenantTimelineId], + data: &'a [KeyRange], + } + let cache = args + .keyspace_cache + .as_ref() + .map(|keyspace_cache_file| { + let contents = match std::fs::read(keyspace_cache_file) { + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + return anyhow::Ok(None); + } + x => x.context("read keyspace cache file")?, + }; + let cache: KeyspaceCacheDe = + serde_json::from_slice(&contents).context("deserialize cache file")?; + let tag_ok = HashSet::::from_iter(cache.tag.into_iter()) + == HashSet::from_iter(timelines.iter().cloned()); + info!("keyspace cache file matches tag: {tag_ok}"); + anyhow::Ok(if tag_ok { Some(cache.data) } else { None }) + }) + .transpose()? 
+ .flatten(); + let all_ranges: Vec = if let Some(cached) = cache { + info!("using keyspace cache file"); + cached + } else { + let mut js = JoinSet::new(); + for timeline in &timelines { + js.spawn({ + let mgmt_api_client = Arc::clone(&mgmt_api_client); + let timeline = *timeline; + async move { + let partitioning = mgmt_api_client + .keyspace( + TenantShardId::unsharded(timeline.tenant_id), + timeline.timeline_id, + ) + .await?; + let lsn = partitioning.at_lsn; + let start = Instant::now(); + let mut filtered = KeySpaceAccum::new(); + // let's hope this is inlined and vectorized... + // TODO: turn this loop into a is_rel_block_range() function. + for r in partitioning.keys.ranges.iter() { + let mut i = r.start; + while i != r.end { + if i.is_rel_block_key() { + filtered.add_key(i); } - (false, false) => None, + i = i.next(); } - }) - .collect::>(); + } + let filtered = filtered.to_keyspace(); + let filter_duration = start.elapsed(); - anyhow::Ok(ranges) - } - }); - } - let mut all_ranges: Vec = Vec::new(); - while let Some(res) = js.join_next().await { - all_ranges.extend(res.unwrap().unwrap()); - } + anyhow::Ok(( + filter_duration, + filtered.ranges.into_iter().map(move |r| KeyRange { + timeline, + timeline_lsn: lsn, + start: r.start.to_i128(), + end: r.end.to_i128(), + }), + )) + } + }); + } + let mut total_filter_duration = Duration::from_secs(0); + let mut all_ranges: Vec = Vec::new(); + while let Some(res) = js.join_next().await { + let (filter_duration, range) = res.unwrap().unwrap(); + all_ranges.extend(range); + total_filter_duration += filter_duration; + } + info!("filter duration: {}", total_filter_duration.as_secs_f64()); + if let Some(cachefile) = args.keyspace_cache.as_ref() { + let cache = KeyspaceCacheSer { + tag: &timelines, + data: &all_ranges, + }; + let bytes = serde_json::to_vec(&cache).context("serialize keyspace for cache file")?; + std::fs::write(cachefile, bytes).context("write keyspace cache file to disk")?; + info!("successfully wrote 
keyspace cache file"); + } + all_ranges + }; let live_stats = Arc::new(LiveStats::default()); - let num_client_tasks = timelines.len(); let num_live_stats_dump = 1; - let num_work_sender_tasks = 1; + let num_work_sender_tasks = args.num_clients.get() * timelines.len(); + let num_main_impl = 1; let start_work_barrier = Arc::new(tokio::sync::Barrier::new( - num_client_tasks + num_live_stats_dump + num_work_sender_tasks, + num_live_stats_dump + num_work_sender_tasks + num_main_impl, )); - let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks)); tokio::spawn({ let stats = Arc::clone(&live_stats); @@ -169,132 +247,135 @@ async fn main_impl( let start = std::time::Instant::now(); tokio::time::sleep(std::time::Duration::from_secs(1)).await; let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); + let missed = stats.missed.swap(0, Ordering::Relaxed); let elapsed = start.elapsed(); info!( - "RPS: {:.0}", - completed_requests as f64 / elapsed.as_secs_f64() + "RPS: {:.0} MISSED: {:.0}", + completed_requests as f64 / elapsed.as_secs_f64(), + missed as f64 / elapsed.as_secs_f64() ); } } }); - let mut work_senders = HashMap::new(); - let mut tasks = Vec::new(); - for tl in &timelines { - let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are - work_senders.insert(tl, sender); - tasks.push(tokio::spawn(client( - args, - *tl, - Arc::clone(&start_work_barrier), - receiver, - Arc::clone(&all_work_done_barrier), - Arc::clone(&live_stats), - ))); - } + let cancel = CancellationToken::new(); - let work_sender: Pin>> = match args.per_target_rate_limit { - None => Box::pin(async move { - let weights = rand::distributions::weighted::WeightedIndex::new( - all_ranges.iter().map(|v| v.len()), - ) - .unwrap(); - - start_work_barrier.wait().await; - - loop { - let (timeline, req) = { - let mut rng = rand::thread_rng(); - let r = &all_ranges[weights.sample(&mut rng)]; - let key: i128 = 
rng.gen_range(r.start..r.end); - let key = repository::Key::from_i128(key); - let (rel_tag, block_no) = - key_to_rel_block(key).expect("we filter non-rel-block keys out above"); - ( - r.timeline, - PagestreamGetPageRequest { - latest: rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, - }, - ) - }; - let sender = work_senders.get(&timeline).unwrap(); - // TODO: what if this blocks? - sender.send(req).await.ok().unwrap(); - } - }), - Some(rps_limit) => Box::pin(async move { - let period = Duration::from_secs_f64(1.0 / (rps_limit as f64)); - - let make_timeline_task: &dyn Fn( - TenantTimelineId, - ) - -> Pin>> = &|timeline| { - let sender = work_senders.get(&timeline).unwrap(); - let ranges: Vec = all_ranges - .iter() - .filter(|r| r.timeline == timeline) - .cloned() - .collect(); - let weights = rand::distributions::weighted::WeightedIndex::new( - ranges.iter().map(|v| v.len()), - ) + let rps_period = args + .per_client_rate + .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64))); + let make_worker: &dyn Fn(WorkerId) -> Pin>> = &|worker_id| { + let live_stats = live_stats.clone(); + let start_work_barrier = start_work_barrier.clone(); + let ranges: Vec = all_ranges + .iter() + .filter(|r| r.timeline == worker_id.timeline) + .cloned() + .collect(); + let weights = + rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len())) .unwrap(); - Box::pin(async move { - let mut ticker = tokio::time::interval(period); - ticker.set_missed_tick_behavior( - /* TODO review this choice */ - tokio::time::MissedTickBehavior::Burst, - ); - loop { - ticker.tick().await; - let req = { - let mut rng = rand::thread_rng(); - let r = &ranges[weights.sample(&mut rng)]; - let key: i128 = rng.gen_range(r.start..r.end); - let key = repository::Key::from_i128(key); - let (rel_tag, block_no) = key_to_rel_block(key) - .expect("we filter non-rel-block keys out above"); - PagestreamGetPageRequest { - latest: 
rng.gen_bool(args.req_latest_probability), - lsn: r.timeline_lsn, - rel: rel_tag, - blkno: block_no, - } - }; - sender.send(req).await.ok().unwrap(); - } - }) - }; - - let tasks: Vec<_> = work_senders - .keys() - .map(|tl| make_timeline_task(**tl)) - .collect(); + let cancel = cancel.clone(); + Box::pin(async move { + let client = + pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); + let mut client = client + .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id) + .await + .unwrap(); start_work_barrier.wait().await; + let client_start = Instant::now(); + let mut ticks_processed = 0; + while !cancel.is_cancelled() { + // Detect if a request took longer than the RPS rate + if let Some(period) = &rps_period { + let periods_passed_until_now = + usize::try_from(client_start.elapsed().as_micros() / period.as_micros()) + .unwrap(); - join_all(tasks).await; - }), + if periods_passed_until_now > ticks_processed { + live_stats.missed((periods_passed_until_now - ticks_processed) as u64); + } + ticks_processed = periods_passed_until_now; + } + + let start = Instant::now(); + let req = { + let mut rng = rand::thread_rng(); + let r = &ranges[weights.sample(&mut rng)]; + let key: i128 = rng.gen_range(r.start..r.end); + let key = Key::from_i128(key); + assert!(key.is_rel_block_key()); + let (rel_tag, block_no) = key + .to_rel_block() + .expect("we filter non-rel-block keys out above"); + PagestreamGetPageRequest { + request_lsn: if rng.gen_bool(args.req_latest_probability) { + Lsn::MAX + } else { + r.timeline_lsn + }, + not_modified_since: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } + }; + client.getpage(req).await.unwrap(); + let end = Instant::now(); + live_stats.request_done(); + ticks_processed += 1; + STATS.with(|stats| { + stats + .borrow() + .lock() + .unwrap() + .observe(end.duration_since(start)) + .unwrap(); + }); + + if let Some(period) = &rps_period { + let next_at = client_start + + 
Duration::from_micros( + (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(), + ); + tokio::time::sleep_until(next_at.into()).await; + } + } + }) }; - if let Some(runtime) = args.runtime { - match tokio::time::timeout(runtime.into(), work_sender).await { - Ok(()) => unreachable!("work sender never terminates"), - Err(_timeout) => { - // this implicitly drops the work_senders, making all the clients exit - } + info!("spawning workers"); + let mut workers = JoinSet::new(); + for timeline in timelines.iter().cloned() { + for num_client in 0..args.num_clients.get() { + let worker_id = WorkerId { + timeline, + num_client, + }; + workers.spawn(make_worker(worker_id)); } - } else { - work_sender.await; - unreachable!("work sender never terminates"); } + let workers = async move { + while let Some(res) = workers.join_next().await { + res.unwrap(); + } + }; - for t in tasks { - t.await.unwrap(); + info!("waiting for everything to become ready"); + start_work_barrier.wait().await; + info!("work started"); + if let Some(runtime) = args.runtime { + tokio::time::sleep(runtime.into()).await; + info!("runtime over, signalling cancellation"); + cancel.cancel(); + workers.await; + info!("work sender exited"); + } else { + workers.await; + unreachable!("work sender never terminates"); } let output = Output { @@ -313,39 +394,3 @@ async fn main_impl( anyhow::Ok(()) } - -#[instrument(skip_all)] -async fn client( - args: &'static Args, - timeline: TenantTimelineId, - start_work_barrier: Arc, - mut work: tokio::sync::mpsc::Receiver, - all_work_done_barrier: Arc, - live_stats: Arc, -) { - start_work_barrier.wait().await; - - let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) - .await - .unwrap(); - let mut client = client - .pagestream(timeline.tenant_id, timeline.timeline_id) - .await - .unwrap(); - - while let Some(req) = work.recv().await { - let start = Instant::now(); - client - .getpage(req) - .await - .with_context(|| 
format!("getpage for {timeline}")) - .unwrap(); - let elapsed = start.elapsed(); - live_stats.inc(); - STATS.with(|stats| { - stats.borrow().lock().unwrap().observe(elapsed).unwrap(); - }); - } - - all_work_done_barrier.wait().await; -} diff --git a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs new file mode 100644 index 0000000000..1bb71b9353 --- /dev/null +++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs @@ -0,0 +1,333 @@ +use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId}; + +use pageserver_client::mgmt_api; +use rand::seq::SliceRandom; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info}; +use utils::id::{TenantTimelineId, TimelineId}; + +use std::{f64, sync::Arc}; +use tokio::{ + sync::{mpsc, OwnedSemaphorePermit}, + task::JoinSet, +}; + +use std::{ + num::NonZeroUsize, + sync::atomic::{AtomicU64, Ordering}, + time::{Duration, Instant}, +}; + +/// Evict & on-demand download random layers. +#[derive(clap::Parser)] +pub(crate) struct Args { + #[clap(long, default_value = "http://localhost:9898")] + mgmt_api_endpoint: String, + #[clap(long)] + pageserver_jwt: Option, + #[clap(long)] + runtime: Option, + #[clap(long, default_value = "1")] + tasks_per_target: NonZeroUsize, + #[clap(long, default_value = "1")] + concurrency_per_target: NonZeroUsize, + /// Limit the run to the first N discovered targets. + #[clap(long)] + limit_to_first_n_targets: Option, + /// Before starting the benchmark, live-reconfigure the pageserver to use the given + /// [`pageserver_api::models::virtual_file::IoEngineKind`]. 
+ #[clap(long)] + set_io_engine: Option, + targets: Option>, +} + +pub(crate) fn main(args: Args) -> anyhow::Result<()> { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build()?; + let task = rt.spawn(main_impl(args)); + rt.block_on(task).unwrap().unwrap(); + Ok(()) +} + +#[derive(serde::Serialize)] +struct Output { + downloads_count: u64, + downloads_bytes: u64, + evictions_count: u64, + timeline_restarts: u64, + #[serde(with = "humantime_serde")] + runtime: Duration, +} + +#[derive(Debug, Default)] +struct LiveStats { + evictions_count: AtomicU64, + downloads_count: AtomicU64, + downloads_bytes: AtomicU64, + timeline_restarts: AtomicU64, +} + +impl LiveStats { + fn eviction_done(&self) { + self.evictions_count.fetch_add(1, Ordering::Relaxed); + } + fn download_done(&self, size: u64) { + self.downloads_count.fetch_add(1, Ordering::Relaxed); + self.downloads_bytes.fetch_add(size, Ordering::Relaxed); + } + fn timeline_restart_done(&self) { + self.timeline_restarts.fetch_add(1, Ordering::Relaxed); + } +} + +async fn main_impl(args: Args) -> anyhow::Result<()> { + let args: &'static Args = Box::leak(Box::new(args)); + + let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new( + args.mgmt_api_endpoint.clone(), + args.pageserver_jwt.as_deref(), + )); + + if let Some(engine_str) = &args.set_io_engine { + mgmt_api_client.put_io_engine(engine_str).await?; + } + + // discover targets + let timelines: Vec = crate::util::cli::targets::discover( + &mgmt_api_client, + crate::util::cli::targets::Spec { + limit_to_first_n_targets: args.limit_to_first_n_targets, + targets: args.targets.clone(), + }, + ) + .await?; + + let token = CancellationToken::new(); + let mut tasks = JoinSet::new(); + + let periodic_stats = Arc::new(LiveStats::default()); + let total_stats = Arc::new(LiveStats::default()); + + let start = Instant::now(); + tasks.spawn({ + let periodic_stats = Arc::clone(&periodic_stats); + let total_stats = Arc::clone(&total_stats); 
+ let cloned_token = token.clone(); + async move { + let mut last_at = Instant::now(); + loop { + if cloned_token.is_cancelled() { + return; + } + tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await; + let now = Instant::now(); + let delta: Duration = now - last_at; + last_at = now; + + let LiveStats { + evictions_count, + downloads_count, + downloads_bytes, + timeline_restarts, + } = &*periodic_stats; + let evictions_count = evictions_count.swap(0, Ordering::Relaxed); + let downloads_count = downloads_count.swap(0, Ordering::Relaxed); + let downloads_bytes = downloads_bytes.swap(0, Ordering::Relaxed); + let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed); + + total_stats.evictions_count.fetch_add(evictions_count, Ordering::Relaxed); + total_stats.downloads_count.fetch_add(downloads_count, Ordering::Relaxed); + total_stats.downloads_bytes.fetch_add(downloads_bytes, Ordering::Relaxed); + total_stats.timeline_restarts.fetch_add(timeline_restarts, Ordering::Relaxed); + + let evictions_per_s = evictions_count as f64 / delta.as_secs_f64(); + let downloads_per_s = downloads_count as f64 / delta.as_secs_f64(); + let downloads_mibs_per_s = downloads_bytes as f64 / delta.as_secs_f64() / ((1 << 20) as f64); + + info!("evictions={evictions_per_s:.2}/s downloads={downloads_per_s:.2}/s download_bytes={downloads_mibs_per_s:.2}MiB/s timeline_restarts={timeline_restarts}"); + } + } + }); + + for tl in timelines { + for _ in 0..args.tasks_per_target.get() { + tasks.spawn(timeline_actor( + args, + Arc::clone(&mgmt_api_client), + tl, + Arc::clone(&periodic_stats), + token.clone(), + )); + } + } + if let Some(runtime) = args.runtime { + tokio::spawn(async move { + tokio::time::sleep(runtime.into()).await; + token.cancel(); + }); + } + + while let Some(res) = tasks.join_next().await { + res.unwrap(); + } + let end = Instant::now(); + let duration: Duration = end - start; + + let output = { + let LiveStats { + evictions_count, + downloads_count, + 
downloads_bytes, + timeline_restarts, + } = &*total_stats; + Output { + downloads_count: downloads_count.load(Ordering::Relaxed), + downloads_bytes: downloads_bytes.load(Ordering::Relaxed), + evictions_count: evictions_count.load(Ordering::Relaxed), + timeline_restarts: timeline_restarts.load(Ordering::Relaxed), + runtime: duration, + } + }; + let output = serde_json::to_string_pretty(&output).unwrap(); + println!("{output}"); + + Ok(()) +} + +async fn timeline_actor( + args: &'static Args, + mgmt_api_client: Arc, + timeline: TenantTimelineId, + live_stats: Arc, + token: CancellationToken, +) { + // TODO: support sharding + let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); + + struct Timeline { + joinset: JoinSet<()>, + layers: Vec>, + concurrency: Arc, + } + while !token.is_cancelled() { + debug!("restarting timeline"); + let layer_map_info = mgmt_api_client + .layer_map_info(tenant_shard_id, timeline.timeline_id) + .await + .unwrap(); + let concurrency = Arc::new(tokio::sync::Semaphore::new( + args.concurrency_per_target.get(), + )); + + let mut joinset = JoinSet::new(); + let layers = layer_map_info + .historic_layers + .into_iter() + .map(|historic_layer| { + let (tx, rx) = mpsc::channel(1); + joinset.spawn(layer_actor( + tenant_shard_id, + timeline.timeline_id, + historic_layer, + rx, + Arc::clone(&mgmt_api_client), + Arc::clone(&live_stats), + )); + tx + }) + .collect::>(); + + let mut timeline = Timeline { + joinset, + layers, + concurrency, + }; + + live_stats.timeline_restart_done(); + + while !token.is_cancelled() { + assert!(!timeline.joinset.is_empty()); + if let Some(res) = timeline.joinset.try_join_next() { + debug!(?res, "a layer actor exited, should not happen"); + timeline.joinset.shutdown().await; + break; + } + + let mut permit = Some( + Arc::clone(&timeline.concurrency) + .acquire_owned() + .await + .unwrap(), + ); + + loop { + let layer_tx = { + let mut rng = rand::thread_rng(); + timeline.layers.choose_mut(&mut 
rng).expect("no layers") + }; + match layer_tx.try_send(permit.take().unwrap()) { + Ok(_) => break, + Err(e) => match e { + mpsc::error::TrySendError::Full(back) => { + // TODO: retrying introduces bias away from slow downloaders + permit.replace(back); + } + mpsc::error::TrySendError::Closed(_) => panic!(), + }, + } + } + } + } +} + +async fn layer_actor( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + mut layer: HistoricLayerInfo, + mut rx: mpsc::Receiver, + mgmt_api_client: Arc, + live_stats: Arc, +) { + #[derive(Clone, Copy)] + enum Action { + Evict, + OnDemandDownload, + } + + while let Some(_permit) = rx.recv().await { + let action = if layer.is_remote() { + Action::OnDemandDownload + } else { + Action::Evict + }; + + let did_it = match action { + Action::Evict => { + let did_it = mgmt_api_client + .layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.eviction_done(); + did_it + } + Action::OnDemandDownload => { + let did_it = mgmt_api_client + .layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name()) + .await + .unwrap(); + live_stats.download_done(layer.layer_file_size()); + did_it + } + }; + if !did_it { + debug!("local copy of layer map appears out of sync, re-downloading"); + return; + } + debug!("did it"); + layer.set_remote(match action { + Action::Evict => true, + Action::OnDemandDownload => false, + }); + } +} diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index d46ae94e8a..f07beeecfd 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -1,9 +1,12 @@ use std::sync::Arc; use humantime::Duration; +use pageserver_api::shard::TenantShardId; use tokio::task::JoinSet; use utils::id::TenantTimelineId; +use pageserver_client::mgmt_api::ForceAwaitLogicalSize; + 
#[derive(clap::Parser)] pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] @@ -56,14 +59,19 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { for tl in timelines { let mgmt_api_client = Arc::clone(&mgmt_api_client); js.spawn(async move { - // TODO: API to explicitly trigger initial logical size computation. - // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation. - // => https://github.com/neondatabase/neon/issues/6168 let info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); + // Polling should not be strictly required here since we await + // for the initial logical size, however it's possible for the request + // to land before the timeline is initialised. This results in an approximate + // logical size. if let Some(period) = args.poll_for_completion { let mut ticker = tokio::time::interval(period.into()); ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); @@ -71,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { while !info.current_logical_size_is_accurate { ticker.tick().await; info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); } diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index e0120c9212..5527557450 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -3,7 +3,6 @@ use utils::logging; /// Re-usable pieces of code that aren't CLI-specific. mod util { - pub(crate) mod connstring; pub(crate) mod request_stats; #[macro_use] pub(crate) mod tokio_thread_local_stats; @@ -15,8 +14,10 @@ mod util { /// The pagebench CLI sub-commands, dispatched in [`main`] below. 
mod cmd { + pub(super) mod aux_files; pub(super) mod basebackup; pub(super) mod getpage_latest_lsn; + pub(super) mod ondemand_download_churn; pub(super) mod trigger_initial_size_calculation; } @@ -26,6 +27,8 @@ enum Args { Basebackup(cmd::basebackup::Args), GetPageLatestLsn(cmd::getpage_latest_lsn::Args), TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args), + OndemandDownloadChurn(cmd::ondemand_download_churn::Args), + AuxFiles(cmd::aux_files::Args), } fn main() { @@ -35,6 +38,7 @@ fn main() { logging::Output::Stderr, ) .unwrap(); + logging::replace_panic_hook_with_tracing_panic_hook().forget(); let args = Args::parse(); match args { @@ -43,6 +47,8 @@ fn main() { Args::TriggerInitialSizeCalculation(args) => { cmd::trigger_initial_size_calculation::main(args) } + Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args), + Args::AuxFiles(args) => cmd::aux_files::main(args), } .unwrap() } diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs deleted file mode 100644 index 07a0ff042d..0000000000 --- a/pageserver/pagebench/src/util/connstring.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String { - let colon_and_jwt = if let Some(jwt) = jwt { - format!(":{jwt}") // TODO: urlescape - } else { - String::new() - }; - format!("postgres://postgres{colon_and_jwt}@{host_port}") -} diff --git a/pageserver/pagebench/src/util/request_stats.rs b/pageserver/pagebench/src/util/request_stats.rs index 5ecf1cbf24..4aa6950782 100644 --- a/pageserver/pagebench/src/util/request_stats.rs +++ b/pageserver/pagebench/src/util/request_stats.rs @@ -66,13 +66,10 @@ impl serde::Serialize for LatencyPercentiles { { use serde::ser::SerializeMap; let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?; - for p in LATENCY_PERCENTILES { + for (p, v) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) { ser.serialize_entry( 
&format!("p{p}"), - &format!( - "{}", - &humantime::format_duration(self.latency_percentiles[0]) - ), + &format!("{}", humantime::format_duration(*v)), )?; } ser.end() diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 2cb661863d..4785c8c4c5 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,8 +14,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::SafekeeperData, _) => Err(AuthError( - "SafekeeperData scope makes no sense for Pageserver".into(), + (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), )), } } diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs new file mode 100644 index 0000000000..5e527b7d61 --- /dev/null +++ b/pageserver/src/aux_file.rs @@ -0,0 +1,286 @@ +use std::sync::Arc; + +use ::metrics::IntGauge; +use bytes::{Buf, BufMut, Bytes}; +use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use tracing::warn; + +// BEGIN Copyright (c) 2017 Servo Contributors + +/// Const version of FNV hash. +#[inline] +#[must_use] +pub const fn fnv_hash(bytes: &[u8]) -> u128 { + const INITIAL_STATE: u128 = 0x6c62272e07bb014262b821756295c58d; + const PRIME: u128 = 0x0000000001000000000000000000013B; + + let mut hash = INITIAL_STATE; + let mut i = 0; + while i < bytes.len() { + hash ^= bytes[i] as u128; + hash = hash.wrapping_mul(PRIME); + i += 1; + } + hash +} + +// END Copyright (c) 2017 Servo Contributors + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, least significant 13B of FNV hash]. 
+fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { + let mut key: [u8; 16] = [0; METADATA_KEY_SIZE]; + let hash = fnv_hash(data).to_be_bytes(); + key[0] = AUX_KEY_PREFIX; + key[1] = dir_level1; + key[2] = dir_level2; + key[3..16].copy_from_slice(&hash[3..16]); + Key::from_metadata_key_fixed_size(&key) +} + +const AUX_DIR_PG_LOGICAL: u8 = 0x01; +const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; + +/// Encode the aux file into a fixed-size key. +/// +/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type. +/// We have a one-to-one mapping for each of the aux files that we support. We hash the remaining part of the path +/// (usually a single file name, or several components) into a 13-byte hash. The way we determine the 2-byte prefix +/// is roughly based on the first two components of the path, one unique number for one component. +/// +/// * pg_logical/mappings -> 0x0101 +/// * pg_logical/snapshots -> 0x0102 +/// * pg_logical/replorigin_checkpoint -> 0x0103 +/// * pg_logical/others -> 0x01FF +/// * pg_replslot/ -> 0x0201 +/// * others -> 0xFFFF +/// +/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. +/// The new file type must have never been written to the storage before. Otherwise, there could be data +/// corruption as the new file belongs to a new prefix but it might have been stored under the `others` prefix. 
+pub fn encode_aux_file_key(path: &str) -> Key { + if let Some(fname) = path.strip_prefix("pg_logical/mappings/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes()) + } else if path == "pg_logical/replorigin_checkpoint" { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"") + } else if let Some(fname) = path.strip_prefix("pg_logical/") { + if cfg!(debug_assertions) { + warn!( + "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_replslot/") { + aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else { + if cfg!(debug_assertions) { + warn!( + "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes()) + } +} + +const AUX_FILE_ENCODING_VERSION: u8 = 0x01; + +pub fn decode_file_value(val: &[u8]) -> anyhow::Result> { + let mut ptr = val; + if ptr.is_empty() { + // empty value = no files + return Ok(Vec::new()); + } + assert_eq!( + ptr.get_u8(), + AUX_FILE_ENCODING_VERSION, + "unsupported aux file value" + ); + let mut files = vec![]; + while ptr.has_remaining() { + let key_len = ptr.get_u32() as usize; + let key = &ptr[..key_len]; + ptr.advance(key_len); + let val_len = ptr.get_u32() as usize; + let content = &ptr[..val_len]; + ptr.advance(val_len); + + let path = std::str::from_utf8(key)?; + files.push((path, content)); + } + Ok(files) +} + +/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference +/// to the original value slice. Be cautious about memory consumption. 
+pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result> { + let mut ptr = val.clone(); + if ptr.is_empty() { + // empty value = no files + return Ok(Vec::new()); + } + assert_eq!( + ptr.get_u8(), + AUX_FILE_ENCODING_VERSION, + "unsupported aux file value" + ); + let mut files = vec![]; + while ptr.has_remaining() { + let key_len = ptr.get_u32() as usize; + let key = ptr.slice(..key_len); + ptr.advance(key_len); + let val_len = ptr.get_u32() as usize; + let content = ptr.slice(..val_len); + ptr.advance(val_len); + + let path = std::str::from_utf8(&key)?.to_string(); + files.push((path, content)); + } + Ok(files) +} + +pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result> { + if files.is_empty() { + // no files = empty value + return Ok(Vec::new()); + } + let mut encoded = vec![]; + encoded.put_u8(AUX_FILE_ENCODING_VERSION); + for (path, content) in files { + if path.len() > u32::MAX as usize { + anyhow::bail!("{} exceeds path size limit", path); + } + encoded.put_u32(path.len() as u32); + encoded.put_slice(path.as_bytes()); + if content.len() > u32::MAX as usize { + anyhow::bail!("{} exceeds content size limit", path); + } + encoded.put_u32(content.len() as u32); + encoded.put_slice(content); + } + Ok(encoded) +} + +/// An estimation of the size of aux files. 
+pub struct AuxFileSizeEstimator { + aux_file_size_gauge: IntGauge, + size: Arc>>, +} + +impl AuxFileSizeEstimator { + pub fn new(aux_file_size_gauge: IntGauge) -> Self { + Self { + aux_file_size_gauge, + size: Arc::new(std::sync::Mutex::new(None)), + } + } + + /// When generating base backup or doing initial logical size calculation + pub fn on_initial(&self, new_size: usize) { + let mut guard = self.size.lock().unwrap(); + *guard = Some(new_size as isize); + self.report(new_size as isize); + } + + pub fn on_add(&self, file_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size += file_size as isize; + self.report(*size); + } + } + + pub fn on_remove(&self, file_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size -= file_size as isize; + self.report(*size); + } + } + + pub fn on_update(&self, old_size: usize, new_size: usize) { + let mut guard = self.size.lock().unwrap(); + if let Some(size) = &mut *guard { + *size += new_size as isize - old_size as isize; + self.report(*size); + } + } + + pub fn report(&self, size: isize) { + self.aux_file_size_gauge.set(size as i64); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hash_portable() { + // AUX file encoding requires the hash to be portable across all platforms. This test case checks + // if the algorithm produces the same hash across different environments. + + assert_eq!( + 265160408618497461376862998434862070044, + super::fnv_hash("test1".as_bytes()) + ); + assert_eq!( + 295486155126299629456360817749600553988, + super::fnv_hash("test/test2".as_bytes()) + ); + assert_eq!( + 144066263297769815596495629667062367629, + super::fnv_hash("".as_bytes()) + ); + } + + #[test] + fn test_encoding_portable() { + // To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions + // of the page server. 
+ assert_eq!( + "62000001017F8B83D94F7081693471ABF91C", + encode_aux_file_key("pg_logical/mappings/test1").to_string(), + ); + assert_eq!( + "62000001027F8E83D94F7081693471ABFCCD", + encode_aux_file_key("pg_logical/snapshots/test2").to_string(), + ); + assert_eq!( + "62000001032E07BB014262B821756295C58D", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string(), + ); + assert_eq!( + "62000001FF4F38E1C74754E7D03C1A660178", + encode_aux_file_key("pg_logical/unsupported").to_string(), + ); + assert_eq!( + "62000002017F8D83D94F7081693471ABFB92", + encode_aux_file_key("pg_replslot/test3").to_string() + ); + assert_eq!( + "620000FFFF2B6ECC8AEF93F643DC44F15E03", + encode_aux_file_key("other_file_not_supported").to_string(), + ); + } + + #[test] + fn test_value_encoding() { + let files = vec![ + ("pg_logical/1.file", "1111".as_bytes()), + ("pg_logical/2.file", "2222".as_bytes()), + ]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + let files = vec![]; + assert_eq!( + files, + decode_file_value(&encode_file_value(&files).unwrap()).unwrap() + ); + } +} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 7e5ae892ad..0f057a4368 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -10,9 +10,10 @@ //! This module is responsible for creation of such tarball //! from data stored in object storage. //! 
-use anyhow::{anyhow, bail, ensure, Context}; -use bytes::{BufMut, BytesMut}; +use anyhow::{anyhow, Context}; +use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; +use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; use std::time::SystemTime; @@ -37,6 +38,14 @@ use postgres_ffi::PG_TLI; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; +#[derive(Debug, thiserror::Error)] +pub enum BasebackupError { + #[error("basebackup pageserver error {0:#}")] + Server(#[from] anyhow::Error), + #[error("basebackup client error {0:#}")] + Client(#[source] io::Error), +} + /// Create basebackup with non-rel data in it. /// Only include relational data if 'full_backup' is true. /// @@ -52,7 +61,7 @@ pub async fn send_basebackup_tarball<'a, W>( prev_lsn: Option, full_backup: bool, ctx: &'a RequestContext, -) -> anyhow::Result<()> +) -> Result<(), BasebackupError> where W: AsyncWrite + Send + Sync + Unpin, { @@ -91,8 +100,10 @@ where // Consolidate the derived and the provided prev_lsn values let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { - if backup_prev != Lsn(0) { - ensure!(backup_prev == provided_prev_lsn); + if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn { + return Err(BasebackupError::Server(anyhow!( + "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}" + ))); } provided_prev_lsn } else { @@ -133,13 +144,117 @@ where ctx: &'a RequestContext, } +/// A sink that accepts SLRU blocks ordered by key and forwards +/// full segments to the archive. 
+struct SlruSegmentsBuilder<'a, 'b, W> +where + W: AsyncWrite + Send + Sync + Unpin, +{ + ar: &'a mut Builder<&'b mut W>, + buf: Vec, + current_segment: Option<(SlruKind, u32)>, + total_blocks: usize, +} + +impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W> +where + W: AsyncWrite + Send + Sync + Unpin, +{ + fn new(ar: &'a mut Builder<&'b mut W>) -> Self { + Self { + ar, + buf: Vec::new(), + current_segment: None, + total_blocks: 0, + } + } + + async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> { + let (kind, segno, _) = key.to_slru_block()?; + + match kind { + SlruKind::Clog => { + if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) { + return Err(BasebackupError::Server(anyhow!( + "invalid SlruKind::Clog record: block.len()={}", + block.len() + ))); + } + } + SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => { + if block.len() != BLCKSZ as usize { + return Err(BasebackupError::Server(anyhow!( + "invalid {:?} record: block.len()={}", + kind, + block.len() + ))); + } + } + } + + let segment = (kind, segno); + match self.current_segment { + None => { + self.current_segment = Some(segment); + self.buf + .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref()); + } + Some(current_seg) if current_seg == segment => { + self.buf + .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref()); + } + Some(_) => { + self.flush().await?; + + self.current_segment = Some(segment); + self.buf + .extend_from_slice(block.slice(..BLCKSZ as usize).as_ref()); + } + } + + Ok(()) + } + + async fn flush(&mut self) -> Result<(), BasebackupError> { + let nblocks = self.buf.len() / BLCKSZ as usize; + let (kind, segno) = self.current_segment.take().unwrap(); + let segname = format!("{}/{:>04X}", kind.to_str(), segno); + let header = new_tar_header(&segname, self.buf.len() as u64)?; + self.ar + .append(&header, self.buf.as_slice()) + .await + .map_err(BasebackupError::Client)?; + + self.total_blocks += nblocks; + debug!("Added 
to basebackup slru {} relsize {}", segname, nblocks); + + self.buf.clear(); + + Ok(()) + } + + async fn finish(mut self) -> Result<(), BasebackupError> { + let res = if self.current_segment.is_none() || self.buf.is_empty() { + Ok(()) + } else { + self.flush().await + }; + + info!("Collected {} SLRU blocks", self.total_blocks); + + res + } +} + impl<'a, W> Basebackup<'a, W> where W: AsyncWrite + Send + Sync + Unpin, { - async fn send_tarball(mut self) -> anyhow::Result<()> { + async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum + let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; + // Create pgdata subdirs structure for dir in PGDATA_SUBDIRS.iter() { let header = new_tar_header_dir(dir)?; @@ -166,26 +281,42 @@ where .context("could not add config file to basebackup tarball")?; } } - - // Gather non-relational files from object storage pages. - for kind in [ - SlruKind::Clog, - SlruKind::MultiXactOffsets, - SlruKind::MultiXactMembers, - ] { - for segno in self + if !lazy_slru_download { + // Gather non-relational files from object storage pages. + let slru_partitions = self .timeline - .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx) - .await? - { - self.add_slru_segment(kind, segno).await?; + .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? 
+ .partition( + self.timeline.get_shard_identity(), + Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64, + ); + + let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); + + for part in slru_partitions.parts { + let blocks = self + .timeline + .get_vectored(part, self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; + + for (key, block) in blocks { + let block = block.map_err(|e| BasebackupError::Server(e.into()))?; + slru_builder.add_block(&key, block).await?; + } } + slru_builder.finish().await?; } let mut min_restart_lsn: Lsn = Lsn::MAX; // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in - self.timeline.list_dbdirs(self.lsn, self.ctx).await? + for ((spcnode, dbnode), has_relmap_file) in self + .timeline + .list_dbdirs(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; @@ -194,7 +325,8 @@ where let rels = self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty // contents of UNLOGGED relations. Postgres copies it in @@ -217,7 +349,12 @@ where } } - for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? { + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? + { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( @@ -225,6 +362,13 @@ where )); info!("Replication slot {} restart LSN={}", path, restart_lsn); min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); + } else if path == "pg_logical/replorigin_checkpoint" { + // replorigin_checkoint is written only on compute shutdown, so it contains + // deteriorated values. 
So we generate our own version of this file for the particular LSN + // based on information about replorigins extracted from transaction commit records. + // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, + // but now we should handle (skip) it for backward compatibility. + continue; } let header = new_tar_header(&path, content.len() as u64)?; self.ar @@ -248,34 +392,67 @@ where for xid in self .timeline .list_twophase_files(self.lsn, self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? { self.add_twophase_file(xid).await?; } + let repl_origins = self + .timeline + .get_replorigins(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; + let n_origins = repl_origins.len(); + if n_origins != 0 { + // + // Construct "pg_logical/replorigin_checkpoint" file based on information about replication origins + // extracted from transaction commit record. We are using this file to pass information about replication + // origins to compute to allow logical replication to restart from proper point. 
+ // + let mut content = Vec::with_capacity(n_origins * 16 + 8); + content.extend_from_slice(&pg_constants::REPLICATION_STATE_MAGIC.to_le_bytes()); + for (origin_id, origin_lsn) in repl_origins { + content.extend_from_slice(&origin_id.to_le_bytes()); + content.extend_from_slice(&[0u8; 6]); // align to 8 bytes + content.extend_from_slice(&origin_lsn.0.to_le_bytes()); + } + let crc32 = crc32c::crc32c(&content); + content.extend_from_slice(&crc32.to_le_bytes()); + let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?; + self.ar.append(&header, &*content).await.context( + "could not add pg_logical/replorigin_checkpoint file to basebackup tarball", + )?; + } fail_point!("basebackup-before-control-file", |_| { - bail!("failpoint basebackup-before-control-file") + Err(BasebackupError::Server(anyhow!( + "failpoint basebackup-before-control-file" + ))) }); // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file().await?; - self.ar.finish().await?; + self.ar.finish().await.map_err(BasebackupError::Client)?; debug!("all tarred up!"); Ok(()) } /// Add contents of relfilenode `src`, naming it as `dst`. 
- async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { + async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> { let nblocks = self .timeline - .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_size(src, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; // If the relation is empty, create an empty file if nblocks == 0 { let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; return Ok(()); } @@ -289,14 +466,18 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) - .await?; + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))?; segment_data.extend_from_slice(&img[..]); } let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; - self.ar.append(&header, segment_data.as_slice()).await?; + self.ar + .append(&header, segment_data.as_slice()) + .await + .map_err(BasebackupError::Client)?; seg += 1; startblk = endblk; @@ -305,39 +486,6 @@ where Ok(()) } - // - // Generate SLRU segment files from repository. 
- // - async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let nblocks = self - .timeline - .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx) - .await?; - - let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); - for blknum in 0..nblocks { - let img = self - .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx) - .await?; - - if slru == SlruKind::Clog { - ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8); - } else { - ensure!(img.len() == BLCKSZ as usize); - } - - slru_buf.extend_from_slice(&img[..BLCKSZ as usize]); - } - - let segname = format!("{}/{:>04X}", slru.to_str(), segno); - let header = new_tar_header(&segname, slru_buf.len() as u64)?; - self.ar.append(&header, slru_buf.as_slice()).await?; - - trace!("Added to basebackup slru {} relsize {}", segname, nblocks); - Ok(()) - } - // // Include database/tablespace directories. // @@ -349,20 +497,22 @@ where spcnode: u32, dbnode: u32, has_relmap_file: bool, - ) -> anyhow::Result<()> { + ) -> Result<(), BasebackupError> { let relmap_img = if has_relmap_file { let img = self .timeline .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; - ensure!( - img.len() - == dispatch_pgversion!( - self.timeline.pg_version, - pgv::bindings::SIZEOF_RELMAPFILE - ) - ); + if img.len() + != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE) + { + return Err(BasebackupError::Server(anyhow!( + "img.len() != SIZE_OF_RELMAPFILE, img.len()={}", + img.len(), + ))); + } Some(img) } else { @@ -375,14 +525,20 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; 
info!("timeline.pg_version {}", self.timeline.pg_version); if let Some(img) = relmap_img { // filenode map for global tablespace let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } else { warn!("global/pg_filenode.map is missing"); } @@ -401,18 +557,26 @@ where && self .timeline .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) - .await? + .await + .map_err(|e| BasebackupError::Server(e.into()))? .is_empty() { return Ok(()); } // User defined tablespaces are not supported - ensure!(spcnode == DEFAULTTABLESPACE_OID); + if spcnode != DEFAULTTABLESPACE_OID { + return Err(BasebackupError::Server(anyhow!( + "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}" + ))); + } // Append dir path for each database let path = format!("base/{}", dbnode); let header = new_tar_header_dir(&path)?; - self.ar.append(&header, &mut io::empty()).await?; + self.ar + .append(&header, &mut io::empty()) + .await + .map_err(BasebackupError::Client)?; if let Some(img) = relmap_img { let dst_path = format!("base/{}/PG_VERSION", dbnode); @@ -422,11 +586,17 @@ where ver => format!("{ver}\x0A"), }; let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; - self.ar.append(&header, pg_version_str.as_bytes()).await?; + self.ar + .append(&header, pg_version_str.as_bytes()) + .await + .map_err(BasebackupError::Client)?; let relmap_path = format!("base/{}/pg_filenode.map", dbnode); let header = new_tar_header(&relmap_path, img.len() as u64)?; - self.ar.append(&header, &img[..]).await?; + self.ar + .append(&header, &img[..]) + .await + .map_err(BasebackupError::Client)?; } }; Ok(()) @@ -435,11 +605,12 @@ where // // Extract twophase state files // - async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> { let 
img = self .timeline .get_twophase_file(xid, self.lsn, self.ctx) - .await?; + .await + .map_err(|e| BasebackupError::Server(e.into()))?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -447,7 +618,10 @@ where buf.put_u32_le(crc); let path = format!("pg_twophase/{:>08X}", xid); let header = new_tar_header(&path, buf.len() as u64)?; - self.ar.append(&header, &buf[..]).await?; + self.ar + .append(&header, &buf[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } @@ -456,24 +630,28 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. // - async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { + async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.get_ancestor_lsn() { - write!(zenith_signal, "PREV LSN: none")?; + if self.timeline.is_ancestor_lsn(self.lsn) { + write!(zenith_signal, "PREV LSN: none") + .map_err(|e| BasebackupError::Server(e.into()))?; } else { - write!(zenith_signal, "PREV LSN: invalid")?; + write!(zenith_signal, "PREV LSN: invalid") + .map_err(|e| BasebackupError::Server(e.into()))?; } } else { - write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?; + write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn) + .map_err(|e| BasebackupError::Server(e.into()))?; } self.ar .append( &new_tar_header("zenith.signal", zenith_signal.len() as u64)?, zenith_signal.as_bytes(), ) - .await?; + .await + .map_err(BasebackupError::Client)?; let checkpoint_bytes = self .timeline @@ -495,7 +673,10 @@ where //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; - self.ar.append(&header, &pg_control_bytes[..]).await?; + self.ar + .append(&header, &pg_control_bytes[..]) + .await + .map_err(BasebackupError::Client)?; //send wal segment let segno = 
self.lsn.segment_number(WAL_SEGMENT_SIZE); @@ -510,8 +691,16 @@ where self.lsn, ) .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?; - ensure!(wal_seg.len() == WAL_SEGMENT_SIZE); - self.ar.append(&header, &wal_seg[..]).await?; + if wal_seg.len() != WAL_SEGMENT_SIZE { + return Err(BasebackupError::Server(anyhow!( + "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}", + wal_seg.len() + ))); + } + self.ar + .append(&header, &wal_seg[..]) + .await + .map_err(BasebackupError::Client)?; Ok(()) } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 621ad050f4..ba5b2608bd 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -1,6 +1,9 @@ +#![recursion_limit = "300"] + //! Main entry point for the Page Server executable. use std::env::{var, VarError}; +use std::io::Read; use std::sync::Arc; use std::time::Duration; use std::{env, ops::ControlFlow, str::FromStr}; @@ -16,6 +19,7 @@ use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING}; use pageserver::task_mgr::WALRECEIVER_RUNTIME; use pageserver::tenant::{secondary, TenantSharedResources}; use remote_storage::GenericRemoteStorage; +use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tracing::*; @@ -33,12 +37,10 @@ use pageserver::{ use postgres_backend::AuthType; use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; -use utils::signals::ShutdownSignals; use utils::{ auth::{JwtAuth, SwappableJwtAuth}, logging, project_build_tag, project_git_version, sentry_init::init_sentry, - signals::Signal, tcp_listener, }; @@ -120,6 +122,11 @@ fn main() -> anyhow::Result<()> { &[("node_id", &conf.id.to_string())], ); + // after setting up logging, log the effective IO engine choice and read path implementations + info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); + info!(?conf.get_impl, "starting with get page implementation"); + info!(?conf.get_vectored_impl, "starting with 
vectored get page implementation"); + let tenants_path = conf.tenants_path(); if !tenants_path.exists() { utils::crashsafe::create_dir_all(conf.tenants_path()) @@ -130,7 +137,7 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors); + virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; @@ -145,37 +152,34 @@ fn initialize_config( workdir: &Utf8Path, ) -> anyhow::Result> { let init = arg_matches.get_flag("init"); - let update_config = init || arg_matches.get_flag("update-config"); - let (mut toml, config_file_exists) = if cfg_file_path.is_file() { - if init { - anyhow::bail!( - "Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it", - ); + let file_contents: Option = match std::fs::File::open(cfg_file_path) { + Ok(mut f) => { + if init { + anyhow::bail!("config file already exists: {cfg_file_path}"); + } + let md = f.metadata().context("stat config file")?; + if md.is_file() { + let mut s = String::new(); + f.read_to_string(&mut s).context("read config file")?; + Some(s.parse().context("parse config file toml")?) 
+ } else { + anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); + } + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => None, + Err(e) => { + anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); } - // Supplement the CLI arguments with the config file - let cfg_file_contents = std::fs::read_to_string(cfg_file_path) - .with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?; - ( - cfg_file_contents - .parse::() - .with_context(|| { - format!("Failed to parse '{cfg_file_path}' as pageserver config") - })?, - true, - ) - } else if cfg_file_path.exists() { - anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file"); - } else { - // We're initializing the tenant, so there's no config file yet - ( - DEFAULT_CONFIG_FILE - .parse::() - .context("could not parse built-in config file")?, - false, - ) }; + let mut effective_config = file_contents.unwrap_or_else(|| { + DEFAULT_CONFIG_FILE + .parse() + .expect("unit tests ensure this works") + }); + + // Patch with overrides from the command line if let Some(values) = arg_matches.get_many::("config-override") { for option_line in values { let doc = toml_edit::Document::from_str(option_line).with_context(|| { @@ -183,22 +187,21 @@ fn initialize_config( })?; for (key, item) in doc.iter() { - if config_file_exists && update_config && key == "id" && toml.contains_key(key) { - anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden"); - } - toml.insert(key, item.clone()); + effective_config.insert(key, item.clone()); } } } - debug!("Resulting toml: {toml}"); - let conf = PageServerConf::parse_and_validate(&toml, workdir) + debug!("Resulting toml: {effective_config}"); + + // Construct the runtime representation + let conf = PageServerConf::parse_and_validate(&effective_config, workdir) .context("Failed to parse pageserver configuration")?; - if update_config { + if init { 
info!("Writing pageserver config to '{cfg_file_path}'"); - std::fs::write(cfg_file_path, toml.to_string()) + std::fs::write(cfg_file_path, effective_config.to_string()) .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?; info!("Config successfully written to '{cfg_file_path}'") } @@ -274,6 +277,12 @@ fn start_pageserver( ); set_build_info_metric(GIT_VERSION, BUILD_TAG); set_launch_timestamp_metric(launch_ts); + #[cfg(target_os = "linux")] + metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap(); + metrics::register_internal(Box::new( + pageserver::metrics::tokio_epoll_uring::Collector::new(), + )) + .unwrap(); pageserver::preinitialize_metrics(); // If any failpoints were set from FAILPOINTS environment variable, @@ -308,6 +317,7 @@ fn start_pageserver( let http_listener = tcp_listener::bind(http_addr)?; let pg_addr = &conf.listen_pg_addr; + info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; @@ -505,16 +515,12 @@ fn start_pageserver( } }); - let secondary_controller = if let Some(remote_storage) = &remote_storage { - secondary::spawn_tasks( - tenant_manager.clone(), - remote_storage.clone(), - background_jobs_barrier.clone(), - shutdown_pageserver.clone(), - ) - } else { - secondary::null_controller() - }; + let secondary_controller = secondary::spawn_tasks( + tenant_manager.clone(), + remote_storage.clone(), + background_jobs_barrier.clone(), + shutdown_pageserver.clone(), + ); // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint @@ -522,14 +528,13 @@ fn start_pageserver( // been configured. 
let disk_usage_eviction_state: Arc = Arc::default(); - if let Some(remote_storage) = &remote_storage { - launch_disk_usage_global_eviction_task( - conf, - remote_storage.clone(), - disk_usage_eviction_state.clone(), - background_jobs_barrier.clone(), - )?; - } + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + tenant_manager.clone(), + background_jobs_barrier.clone(), + )?; // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. @@ -539,7 +544,7 @@ fn start_pageserver( let router_state = Arc::new( http::routes::State::new( conf, - tenant_manager, + tenant_manager.clone(), http_auth.clone(), remote_storage.clone(), broker_client.clone(), @@ -589,32 +594,37 @@ fn start_pageserver( None, "consumption metrics collection", true, - async move { - // first wait until background jobs are cleared to launch. - // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); + { + let tenant_manager = tenant_manager.clone(); + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + let cancel = task_mgr::shutdown_token(); - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => {} + }; - pageserver::consumption_metrics::collect_metrics( - metric_collection_endpoint, - conf.metric_collection_interval, - conf.cached_metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - local_disk_storage, - cancel, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) + pageserver::consumption_metrics::collect_metrics( + tenant_manager, + metric_collection_endpoint, + &conf.metric_collection_bucket, + conf.metric_collection_interval, + conf.cached_metric_collection_interval, + conf.synthetic_size_calculation_interval, + conf.id, + local_disk_storage, + cancel, + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")) + .await?; + Ok(()) + } }, ); } @@ -637,17 +647,20 @@ fn start_pageserver( None, "libpq endpoint listener", true, - async move { - page_service::libpq_listener_main( - conf, - broker_client, - pg_auth, - pageserver_listener, - conf.pg_auth_type, - libpq_ctx, - task_mgr::shutdown_token(), - ) - .await + { + let tenant_manager = tenant_manager.clone(); + async move { + page_service::libpq_listener_main( + tenant_manager, + broker_client, + pg_auth, + pageserver_listener, + conf.pg_auth_type, + libpq_ctx, + task_mgr::shutdown_token(), + ) + .await + } }, ); } @@ -655,44 +668,40 @@ fn start_pageserver( let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); // All started up! Now just sit and wait for shutdown signal. - ShutdownSignals::handle(|signal| match signal { - Signal::Quit => { - info!( - "Got {}. Terminating in immediate shutdown mode", - signal.name() - ); - std::process::exit(111); - } - Signal::Interrupt | Signal::Terminate => { - info!( - "Got {}. 
Terminating gracefully in fast shutdown mode", - signal.name() - ); + { + BACKGROUND_RUNTIME.block_on(async move { + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); + std::process::exit(111); + } + _ = sigint.recv() => { "SIGINT" }, + _ = sigterm.recv() => { "SIGTERM" }, + }; + + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); // This cancels the `shutdown_pageserver` cancellation tree. // Right now that tree doesn't reach very far, and `task_mgr` is used instead. // The plan is to change that over time. shutdown_pageserver.take(); - let bg_remote_storage = remote_storage.clone(); - let bg_deletion_queue = deletion_queue.clone(); - BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver( - bg_remote_storage.map(|_| bg_deletion_queue), - 0, - )); + pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await; unreachable!() - } - }) + }) + } } fn create_remote_storage_client( conf: &'static PageServerConf, -) -> anyhow::Result> { +) -> anyhow::Result { let config = if let Some(config) = &conf.remote_storage_config { config } else { - tracing::warn!("no remote storage configured, this is a deprecated configuration"); - return Ok(None); + anyhow::bail!("no remote storage configured, this is a deprecated configuration"); }; // Create the client @@ -712,7 +721,7 @@ fn create_remote_storage_client( GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures); } - Ok(Some(remote_storage)) + Ok(remote_storage) } fn cli() -> Command { @@ -734,18 +743,13 @@ fn cli() -> Command { // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( 
Arg::new("config-override") + .long("config-override") .short('c') .num_args(1) .action(ArgAction::Append) .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \ Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) - .arg( - Arg::new("update-config") - .long("update-config") - .action(ArgAction::SetTrue) - .help("Update the config file when started"), - ) .arg( Arg::new("enabled-features") .long("enabled-features") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 7c03dc1bdd..b4a0d1ac02 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -7,6 +7,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use pageserver_api::shard::TenantShardId; use remote_storage::{RemotePath, RemoteStorageConfig}; +use serde; use serde::de::IntoDeserializer; use std::env; use storage_broker::Uri; @@ -20,7 +21,6 @@ use std::num::NonZeroUsize; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use toml_edit; use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; @@ -30,24 +30,28 @@ use utils::{ logging::LogFormat, }; -use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; -use crate::tenant::config::TenantConf; -use crate::tenant::config::TenantConfOpt; +use crate::tenant::timeline::GetVectoredImpl; +use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; +use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; +use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{tenant::config::TenantConf, virtual_file}; use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, - TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, + IGNORED_TENANT_FILE_NAME, 
TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, + TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, }; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; +use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE; + pub mod defaults { use crate::tenant::config::defaults::*; use const_format::formatcp; - pub use pageserver_api::{ + pub use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; @@ -79,6 +83,22 @@ pub mod defaults { pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + #[cfg(target_os = "linux")] + pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring"; + + #[cfg(not(target_os = "linux"))] + pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; + + pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + + pub const DEFAULT_GET_IMPL: &str = "legacy"; + + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + + pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + /// /// Default built-in configuration file. /// @@ -114,6 +134,16 @@ pub mod defaults { #ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} +#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' + +#get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' + +#get_impl = '{DEFAULT_GET_IMPL}' + +#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' + +#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -128,11 +158,12 @@ pub mod defaults { #min_resident_size_override = .. 
# in bytes #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' -#gc_feedback = false #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} #secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} +#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} + [remote_storage] "# @@ -192,9 +223,9 @@ pub struct PageServerConf { pub log_format: LogFormat, - /// Number of tenants which will be concurrently loaded from remote storage proactively on startup, - /// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes - /// loading such tenants, vs. other work in the system. + /// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach. + /// + /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system. pub concurrent_tenant_warmup: ConfigurableSemaphore, /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed. @@ -211,6 +242,7 @@ pub struct PageServerConf { // How often to send unchanged cached metrics to the metrics endpoint. pub cached_metric_collection_interval: Duration, pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, pub disk_usage_based_eviction: Option, @@ -247,6 +279,23 @@ pub struct PageServerConf { /// Maximum number of WAL records to be ingested and committed at the same time pub ingest_batch_size: u64, + + pub virtual_file_io_engine: virtual_file::IoEngineKind, + + pub get_vectored_impl: GetVectoredImpl, + + pub get_impl: GetImpl, + + pub max_vectored_read_bytes: MaxVectoredReadBytes, + + pub validate_vectored_get: bool, + + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. 
When this + /// is exceeded, we start proactively closing ephemeral layers to limit the total amount + /// of ephemeral data. + /// + /// Setting this to zero disables limits on total ephemeral layer size. + pub ephemeral_bytes_per_memory_kb: usize, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -259,21 +308,29 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); // use dedicated enum for builder to better indicate the intention // and avoid possible confusion with nested options +#[derive(Clone, Default)] pub enum BuilderValue { Set(T), + #[default] NotSet, } -impl BuilderValue { - pub fn ok_or(self, err: E) -> Result { +impl BuilderValue { + pub fn ok_or(&self, field_name: &'static str, default: BuilderValue) -> anyhow::Result { match self { - Self::Set(v) => Ok(v), - Self::NotSet => Err(err), + Self::Set(v) => Ok(v.clone()), + Self::NotSet => match default { + BuilderValue::Set(v) => Ok(v.clone()), + BuilderValue::NotSet => { + anyhow::bail!("missing config value {field_name:?}") + } + }, } } } // needed to simplify config construction +#[derive(Default)] struct PageServerConfigBuilder { listen_pg_addr: BuilderValue, @@ -314,6 +371,7 @@ struct PageServerConfigBuilder { cached_metric_collection_interval: BuilderValue, metric_collection_endpoint: BuilderValue>, synthetic_size_calculation_interval: BuilderValue, + metric_collection_bucket: BuilderValue>, disk_usage_based_eviction: BuilderValue>, @@ -331,10 +389,23 @@ struct PageServerConfigBuilder { secondary_download_concurrency: BuilderValue, ingest_batch_size: BuilderValue, + + virtual_file_io_engine: BuilderValue, + + get_vectored_impl: BuilderValue, + + get_impl: BuilderValue, + + max_vectored_read_bytes: BuilderValue, + + validate_vectored_get: BuilderValue, + + ephemeral_bytes_per_memory_kb: BuilderValue, } -impl Default for PageServerConfigBuilder { - fn default() -> Self { +impl PageServerConfigBuilder { + #[inline(always)] + fn default_values() -> 
Self { use self::BuilderValue::*; use defaults::*; Self { @@ -387,6 +458,8 @@ impl Default for PageServerConfigBuilder { .expect("cannot parse default synthetic size calculation interval")), metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), + metric_collection_bucket: Set(None), + disk_usage_based_eviction: Set(None), test_remote_failures: Set(0), @@ -406,6 +479,16 @@ impl Default for PageServerConfigBuilder { secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), + + virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), + + get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), + get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()), + max_vectored_read_bytes: Set(MaxVectoredReadBytes( + NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), + )), + validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), + ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), } } } @@ -510,6 +593,13 @@ impl PageServerConfigBuilder { self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) } + pub fn metric_collection_bucket( + &mut self, + metric_collection_bucket: Option, + ) { + self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket) + } + pub fn synthetic_size_calculation_interval( &mut self, synthetic_size_calculation_interval: Duration, @@ -562,114 +652,124 @@ impl PageServerConfigBuilder { self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) } + pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) { + self.virtual_file_io_engine = BuilderValue::Set(value); + } + + pub fn get_vectored_impl(&mut self, value: GetVectoredImpl) { + self.get_vectored_impl = BuilderValue::Set(value); + } + + pub fn get_impl(&mut self, value: GetImpl) { + self.get_impl = BuilderValue::Set(value); + } + + pub fn get_max_vectored_read_bytes(&mut self, value: 
MaxVectoredReadBytes) { + self.max_vectored_read_bytes = BuilderValue::Set(value); + } + + pub fn get_validate_vectored_get(&mut self, value: bool) { + self.validate_vectored_get = BuilderValue::Set(value); + } + + pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { + self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { - let concurrent_tenant_warmup = self - .concurrent_tenant_warmup - .ok_or(anyhow!("missing concurrent_tenant_warmup"))?; - let concurrent_tenant_size_logical_size_queries = self - .concurrent_tenant_size_logical_size_queries - .ok_or(anyhow!( - "missing concurrent_tenant_size_logical_size_queries" - ))?; - Ok(PageServerConf { - listen_pg_addr: self - .listen_pg_addr - .ok_or(anyhow!("missing listen_pg_addr"))?, - listen_http_addr: self - .listen_http_addr - .ok_or(anyhow!("missing listen_http_addr"))?, - availability_zone: self - .availability_zone - .ok_or(anyhow!("missing availability_zone"))?, - wait_lsn_timeout: self - .wait_lsn_timeout - .ok_or(anyhow!("missing wait_lsn_timeout"))?, - wal_redo_timeout: self - .wal_redo_timeout - .ok_or(anyhow!("missing wal_redo_timeout"))?, - superuser: self.superuser.ok_or(anyhow!("missing superuser"))?, - page_cache_size: self - .page_cache_size - .ok_or(anyhow!("missing page_cache_size"))?, - max_file_descriptors: self - .max_file_descriptors - .ok_or(anyhow!("missing max_file_descriptors"))?, - workdir: self.workdir.ok_or(anyhow!("missing workdir"))?, - pg_distrib_dir: self - .pg_distrib_dir - .ok_or(anyhow!("missing pg_distrib_dir"))?, - http_auth_type: self - .http_auth_type - .ok_or(anyhow!("missing http_auth_type"))?, - pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?, - auth_validation_public_key_path: self - .auth_validation_public_key_path - .ok_or(anyhow!("missing auth_validation_public_key_path"))?, - remote_storage_config: self - .remote_storage_config - .ok_or(anyhow!("missing remote_storage_config"))?, 
- id: self.id.ok_or(anyhow!("missing id"))?, - // TenantConf is handled separately - default_tenant_conf: TenantConf::default(), - broker_endpoint: self - .broker_endpoint - .ok_or(anyhow!("No broker endpoints provided"))?, - broker_keepalive_interval: self - .broker_keepalive_interval - .ok_or(anyhow!("No broker keepalive interval provided"))?, - log_format: self.log_format.ok_or(anyhow!("missing log_format"))?, - concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( - concurrent_tenant_size_logical_size_queries, - ), - metric_collection_interval: self - .metric_collection_interval - .ok_or(anyhow!("missing metric_collection_interval"))?, - cached_metric_collection_interval: self - .cached_metric_collection_interval - .ok_or(anyhow!("missing cached_metric_collection_interval"))?, - metric_collection_endpoint: self - .metric_collection_endpoint - .ok_or(anyhow!("missing metric_collection_endpoint"))?, - synthetic_size_calculation_interval: self - .synthetic_size_calculation_interval - .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?, - disk_usage_based_eviction: self - .disk_usage_based_eviction - .ok_or(anyhow!("missing disk_usage_based_eviction"))?, - test_remote_failures: self - .test_remote_failures - .ok_or(anyhow!("missing test_remote_failuers"))?, - ondemand_download_behavior_treat_error_as_warn: self - .ondemand_download_behavior_treat_error_as_warn - .ok_or(anyhow!( - "missing ondemand_download_behavior_treat_error_as_warn" - ))?, - background_task_maximum_delay: self - .background_task_maximum_delay - .ok_or(anyhow!("missing background_task_maximum_delay"))?, - control_plane_api: self - .control_plane_api - .ok_or(anyhow!("missing control_plane_api"))?, - control_plane_api_token: self - 
.control_plane_api_token - .ok_or(anyhow!("missing control_plane_api_token"))?, - control_plane_emergency_mode: self - .control_plane_emergency_mode - .ok_or(anyhow!("missing control_plane_emergency_mode"))?, - heatmap_upload_concurrency: self - .heatmap_upload_concurrency - .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, - secondary_download_concurrency: self - .secondary_download_concurrency - .ok_or(anyhow!("missing secondary_download_concurrency"))?, - ingest_batch_size: self - .ingest_batch_size - .ok_or(anyhow!("missing ingest_batch_size"))?, - }) + let default = Self::default_values(); + + macro_rules! conf { + (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => { + PageServerConf { + $( + $field: self.$field.ok_or(stringify!($field), default.$field)?, + )* + $( + $custom_field: $custom_value, + )* + } + }; + } + + Ok(conf!( + USING DEFAULT + { + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + workdir, + pg_distrib_dir, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage_config, + id, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + cached_metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_api_token, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + get_vectored_impl, + get_impl, + max_vectored_read_bytes, + validate_vectored_get, + ephemeral_bytes_per_memory_kb, + } + CUSTOM LOGIC + { + // TenantConf is handled separately + default_tenant_conf: TenantConf::default(), + concurrent_tenant_warmup: 
ConfigurableSemaphore::new({ + self + .concurrent_tenant_warmup + .ok_or("concurrent_tenant_warmpup", + default.concurrent_tenant_warmup)? + }), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + self + .concurrent_tenant_size_logical_size_queries + .ok_or("concurrent_tenant_size_logical_size_queries", + default.concurrent_tenant_size_logical_size_queries.clone())? + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + // re-use `concurrent_tenant_size_logical_size_queries` + self + .concurrent_tenant_size_logical_size_queries + .ok_or("eviction_task_immitated_concurrent_logical_size_queries", + default.concurrent_tenant_size_logical_size_queries.clone())?, + ), + virtual_file_io_engine: match self.virtual_file_io_engine { + BuilderValue::Set(v) => v, + BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? { + io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise + io_engine::FeatureTestResult::Worse { engine, remark } => { + // TODO: bubble this up to the caller so we can tracing::warn! it. + eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + engine + } + }, + }, + } + )) } } @@ -686,6 +786,10 @@ impl PageServerConf { self.workdir.join("deletion") } + pub fn metadata_path(&self) -> Utf8PathBuf { + self.workdir.join("metadata.json") + } + pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf { // Encode a version in the filename, so that if we ever switch away from JSON we can // increment this. 
@@ -745,18 +849,7 @@ impl PageServerConf { .join(timeline_id.to_string()) } - pub fn timeline_uninit_mark_file_path( - &self, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - ) -> Utf8PathBuf { - path_with_suffix_extension( - self.timeline_path(&tenant_shard_id, &timeline_id), - TIMELINE_UNINIT_MARK_SUFFIX, - ) - } - - pub fn timeline_delete_mark_file_path( + pub(crate) fn timeline_delete_mark_file_path( &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, @@ -767,7 +860,10 @@ impl PageServerConf { ) } - pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + pub(crate) fn tenant_deleted_mark_file_path( + &self, + tenant_shard_id: &TenantShardId, + ) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TENANT_DELETED_MARKER_FILE_NAME) } @@ -788,17 +884,6 @@ impl PageServerConf { .join(connection_id.to_string()) } - /// Points to a place in pageserver's local directory, - /// where certain timeline's metadata file should be located. - pub fn metadata_path( - &self, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - ) -> Utf8PathBuf { - self.timeline_path(tenant_shard_id, timeline_id) - .join(METADATA_FILE_NAME) - } - /// Turns storage remote path of a file into its local path. pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf { remote_path.with_base(&self.workdir) @@ -882,6 +967,9 @@ impl PageServerConf { let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; builder.metric_collection_endpoint(Some(endpoint)); }, + "metric_collection_bucket" => { + builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) 
+ } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), @@ -920,6 +1008,27 @@ impl PageServerConf { builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) }, "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), + "virtual_file_io_engine" => { + builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) + } + "get_vectored_impl" => { + builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) + } + "get_impl" => { + builder.get_impl(parse_toml_from_str("get_impl", item)?) + } + "max_vectored_read_bytes" => { + let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; + builder.get_max_vectored_read_bytes( + MaxVectoredReadBytes( + NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) + } + "validate_vectored_get" => { + builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) + } + "ephemeral_bytes_per_memory_kb" => { + builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? 
as usize) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -982,6 +1091,7 @@ impl PageServerConf { metric_collection_interval: Duration::from_secs(60), cached_metric_collection_interval: Duration::from_secs(60 * 60), metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), disk_usage_based_eviction: None, test_remote_failures: 0, @@ -993,6 +1103,15 @@ impl PageServerConf { heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, + virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), + get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant"), + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, } } } @@ -1120,17 +1239,15 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { - use std::{ - fs, - num::{NonZeroU32, NonZeroUsize}, - }; + use std::{fs, num::NonZeroU32}; use camino_tempfile::{tempdir, Utf8TempDir}; + use pageserver_api::models::EvictionPolicy; use remote_storage::{RemoteStorageKind, S3Config}; use utils::serde_percent::Percent; use super::*; - use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION}; + use crate::DEFAULT_PG_VERSION; const ALL_BASE_VALUES_TOML: &str = r#" # Initial configuration file created by 'pageserver --init' @@ -1209,6 +1326,7 @@ background_task_maximum_delay = '334 s' defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL )?, metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, + 
metric_collection_bucket: None, synthetic_size_calculation_interval: humantime::parse_duration( defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, @@ -1224,6 +1342,15 @@ background_task_maximum_delay = '334 s' heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, + virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), + get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant") + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, }, "Correct defaults should be used when no config values are provided" ); @@ -1276,6 +1403,7 @@ background_task_maximum_delay = '334 s' metric_collection_interval: Duration::from_secs(222), cached_metric_collection_interval: Duration::from_secs(22200), metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), + metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(333), disk_usage_based_eviction: None, test_remote_failures: 0, @@ -1287,6 +1415,15 @@ background_task_maximum_delay = '334 s' heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, ingest_batch_size: 100, + virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), + get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + 
.expect("Invalid default constant") + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, }, "Should be able to parse all basic config values correctly" ); @@ -1332,6 +1469,7 @@ broker_endpoint = '{broker_endpoint}' parsed_remote_storage_config, RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(local_storage_path.clone()), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, "Remote storage config should correctly parse the local FS config and fill other storage defaults" ); @@ -1398,7 +1536,9 @@ broker_endpoint = '{broker_endpoint}' endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, max_keys_per_list_response: None, + upload_storage_class: None, }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, "Remote storage config should correctly parse the S3 config" ); @@ -1519,17 +1659,50 @@ threshold = "20m" eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, }) ); + match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::NoEviction => panic!("Unexpected eviction opolicy tenant settings"), - EvictionPolicy::LayerAccessThreshold(eviction_thresold) => { - assert_eq!(eviction_thresold.period, Duration::from_secs(20 * 60)); - assert_eq!(eviction_thresold.threshold, Duration::from_secs(20 * 60)); + EvictionPolicy::LayerAccessThreshold(eviction_threshold) => { + assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60)); + assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60)); } + other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), } Ok(()) } + #[test] + fn parse_imitation_only_pageserver_config() { + let tempdir = tempdir().unwrap(); + let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap(); + + let pageserver_conf_toml = format!( + r#"pg_distrib_dir = "{pg_distrib_dir}" +metric_collection_endpoint = "http://sample.url" 
+metric_collection_interval = "10min" +id = 222 + +[tenant_config] +evictions_low_residence_duration_metric_threshold = "20m" + +[tenant_config.eviction_policy] +kind = "OnlyImitiate" +period = "20m" +threshold = "20m" +"#, + ); + let toml: Document = pageserver_conf_toml.parse().unwrap(); + let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap(); + + match &conf.default_tenant_conf.eviction_policy { + EvictionPolicy::OnlyImitiate(t) => { + assert_eq!(t.period, Duration::from_secs(20 * 60)); + assert_eq!(t.threshold, Duration::from_secs(20 * 60)); + } + other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), + } + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index bde2cedca7..18c1a6cd9b 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -2,11 +2,13 @@ //! and push them to a HTTP endpoint. 
use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::size::CalculateSyntheticSizeError; use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant}; +use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; use pageserver_api::models::TenantState; +use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; use std::collections::HashMap; use std::sync::Arc; @@ -17,7 +19,7 @@ use tracing::*; use utils::id::NodeId; mod metrics; -use metrics::MetricsKey; +use crate::consumption_metrics::metrics::MetricsKey; mod disk_cache; mod upload; @@ -40,7 +42,9 @@ type Cache = HashMap; /// Main thread that serves metrics collection #[allow(clippy::too_many_arguments)] pub async fn collect_metrics( + tenant_manager: Arc, metric_collection_endpoint: &Url, + metric_collection_bucket: &Option, metric_collection_interval: Duration, _cached_metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, @@ -65,15 +69,19 @@ pub async fn collect_metrics( None, "synthetic size calculation", false, - async move { - calculate_synthetic_size_worker( - synthetic_size_calculation_interval, - &cancel, - &worker_ctx, - ) - .instrument(info_span!("synthetic_size_worker")) - .await?; - Ok(()) + { + let tenant_manager = tenant_manager.clone(); + async move { + calculate_synthetic_size_worker( + tenant_manager, + synthetic_size_calculation_interval, + &cancel, + &worker_ctx, + ) + .instrument(info_span!("synthetic_size_worker")) + .await?; + Ok(()) + } }, ); @@ -94,13 +102,27 @@ pub async fn collect_metrics( .build() .expect("Failed to create http client with timeout"); + let bucket_client = if let Some(bucket_config) = metric_collection_bucket { + match GenericRemoteStorage::from_config(bucket_config) { + Ok(client) => 
Some(client), + Err(e) => { + // Non-fatal error: if we were given an invalid config, we will proceed + // with sending metrics over the network, but not to S3. + tracing::warn!("Invalid configuration for metric_collection_bucket: {e}"); + None + } + } + } else { + None + }; + let node_id = node_id.to_string(); loop { let started_at = Instant::now(); // these are point in time, with variable "now" - let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await; + let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await; let metrics = Arc::new(metrics); @@ -118,10 +140,18 @@ pub async fn collect_metrics( tracing::error!("failed to persist metrics to {path:?}: {e:#}"); } } + + if let Some(bucket_client) = &bucket_client { + let res = + upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await; + if let Err(e) = res { + tracing::error!("failed to upload to S3: {e:#}"); + } + } }; let upload = async { - let res = upload::upload_metrics( + let res = upload::upload_metrics_http( &client, metric_collection_endpoint, &cancel, @@ -132,7 +162,7 @@ pub async fn collect_metrics( .await; if let Err(e) = res { // serialization error which should never happen - tracing::error!("failed to upload due to {e:#}"); + tracing::error!("failed to upload via HTTP due to {e:#}"); } }; @@ -247,6 +277,7 @@ async fn reschedule( /// Caclculate synthetic size for each active tenant async fn calculate_synthetic_size_worker( + tenant_manager: Arc, synthetic_size_calculation_interval: Duration, cancel: &CancellationToken, ctx: &RequestContext, @@ -259,7 +290,7 @@ async fn calculate_synthetic_size_worker( loop { let started_at = Instant::now(); - let tenants = match mgr::list_tenants().await { + let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(e) => { warn!("cannot get tenant list: {e:#}"); @@ -267,21 +298,25 @@ async fn calculate_synthetic_size_worker( } }; - for (tenant_shard_id, tenant_state) in 
tenants { + for (tenant_shard_id, tenant_state, _gen) in tenants { if tenant_state != TenantState::Active { continue; } - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // We only send consumption metrics from shard 0, so don't waste time calculating // synthetic size on other shards. continue; } - let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else { + let Ok(tenant) = tenant_manager.get_attached_tenant_shard(tenant_shard_id) else { continue; }; + if !tenant.is_active() { + continue; + } + // there is never any reason to exit calculate_synthetic_size_worker following any // return value -- we don't need to care about shutdown because no tenant is found when // pageserver is shut down. @@ -314,21 +349,12 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re // Same for the loop that fetches computed metrics. // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, // which turns out is really handy to understand the system. - let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else { - return; - }; - - // this error can be returned if timeline is shutting down, but it does not - // mean the synthetic size worker should terminate. we do not need any checks - // in this function because `mgr::get_tenant` will error out after shutdown has - // progressed to shutting down tenants. 
- let shutting_down = matches!( - e.downcast_ref::(), - Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) - ); - - if !shutting_down { - let tenant_shard_id = tenant.tenant_shard_id(); - error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); + match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await { + Ok(_) => {} + Err(CalculateSyntheticSizeError::Cancelled) => {} + Err(e) => { + let tenant_shard_id = tenant.tenant_shard_id(); + error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}"); + } } } diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 0b827816bc..7ba2d04c4f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -1,3 +1,4 @@ +use crate::tenant::mgr::TenantManager; use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize}; use chrono::{DateTime, Utc}; use consumption_metrics::EventType; @@ -181,6 +182,7 @@ impl MetricsKey { } pub(super) async fn collect_all_metrics( + tenant_manager: &Arc, cached_metrics: &Cache, ctx: &RequestContext, ) -> Vec { @@ -188,7 +190,7 @@ pub(super) async fn collect_all_metrics( let started_at = std::time::Instant::now(); - let tenants = match crate::tenant::mgr::list_tenants().await { + let tenants = match tenant_manager.list_tenants() { Ok(tenants) => tenants, Err(err) => { tracing::error!("failed to list tenants: {:?}", err); @@ -196,11 +198,12 @@ pub(super) async fn collect_all_metrics( } }; - let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move { - if state != TenantState::Active || !id.is_zero() { + let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move { + if state != TenantState::Active || !id.is_shard_zero() { None } else { - crate::tenant::mgr::get_tenant(id, true) + tenant_manager + .get_attached_tenant_shard(id) .ok() 
.map(|tenant| (id.tenant_id, tenant)) } diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index 38a4c9eb5d..f9cbcea565 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,7 +1,5 @@ use super::*; use std::collections::HashMap; -use std::time::SystemTime; -use utils::lsn::Lsn; #[test] fn startup_collected_timeline_metrics_before_advancing() { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 322ed95cc8..4e8283c3e4 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -1,4 +1,9 @@ +use std::time::SystemTime; + +use chrono::{DateTime, Utc}; use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE}; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; @@ -13,8 +18,9 @@ struct Ids { pub(super) timeline_id: Option, } +/// Serialize and write metrics to an HTTP endpoint #[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] -pub(super) async fn upload_metrics( +pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, @@ -74,6 +80,60 @@ pub(super) async fn upload_metrics( Ok(()) } +/// Serialize and write metrics to a remote storage object +#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))] +pub(super) async fn upload_metrics_bucket( + client: &GenericRemoteStorage, + cancel: &CancellationToken, + node_id: &str, + metrics: &[RawMetric], +) -> anyhow::Result<()> { + if metrics.is_empty() { + // Skip uploads if we have no metrics, so that readers don't have to handle the edge case + // of an empty object. 
+ return Ok(()); + } + + // Compose object path + let datetime: DateTime = SystemTime::now().into(); + let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ"); + let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?; + + // Set up a gzip writer into a buffer + let mut compressed_bytes: Vec = Vec::new(); + let compressed_writer = std::io::Cursor::new(&mut compressed_bytes); + let mut gzip_writer = async_compression::tokio::write::GzipEncoder::new(compressed_writer); + + // Serialize and write into compressed buffer + let started_at = std::time::Instant::now(); + for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) { + let (_chunk, body) = res?; + gzip_writer.write_all(&body).await?; + } + gzip_writer.flush().await?; + gzip_writer.shutdown().await?; + let compressed_length = compressed_bytes.len(); + + // Write to remote storage + client + .upload_storage_object( + futures::stream::once(futures::future::ready(Ok(compressed_bytes.into()))), + compressed_length, + &path, + cancel, + ) + .await?; + let elapsed = started_at.elapsed(); + + tracing::info!( + compressed_length, + elapsed_ms = elapsed.as_millis(), + "write metrics bucket at {path}", + ); + + Ok(()) +} + // The return type is quite ugly, but we gain testability in isolation fn serialize_in_chunks<'a, F>( chunk_size: usize, @@ -262,35 +322,33 @@ async fn upload( ) -> Result<(), UploadError> { let warn_after = 3; let max_attempts = 10; + + // this is used only with tests so far + let last_value = if is_last { "true" } else { "false" }; + let res = utils::backoff::retry( - move || { - let body = body.clone(); - async move { - let res = client - .post(metric_collection_endpoint.clone()) - .header(reqwest::header::CONTENT_TYPE, "application/json") - .header( - LAST_IN_BATCH.clone(), - if is_last { "true" } else { "false" }, - ) - .body(body) - .send() - .await; + || async { + let res = client + .post(metric_collection_endpoint.clone()) + 
.header(reqwest::header::CONTENT_TYPE, "application/json") + .header(LAST_IN_BATCH.clone(), last_value) + .body(body.clone()) + .send() + .await; - let res = res.and_then(|res| res.error_for_status()); + let res = res.and_then(|res| res.error_for_status()); - // 10 redirects are normally allowed, so we don't need worry about 3xx - match res { - Ok(_response) => Ok(()), - Err(e) => { - let status = e.status().filter(|s| s.is_client_error()); - if let Some(status) = status { - // rejection used to be a thing when the server could reject a - // whole batch of metrics if one metric was bad. - Err(UploadError::Rejected(status)) - } else { - Err(UploadError::Reqwest(e)) - } + // 10 redirects are normally allowed, so we don't need worry about 3xx + match res { + Ok(_response) => Ok(()), + Err(e) => { + let status = e.status().filter(|s| s.is_client_error()); + if let Some(status) = status { + // rejection used to be a thing when the server could reject a + // whole batch of metrics if one metric was bad. + Err(UploadError::Rejected(status)) + } else { + Err(UploadError::Reqwest(e)) } } } @@ -299,9 +357,11 @@ async fn upload( warn_after, max_attempts, "upload consumption_metrics", - utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled), + cancel, ) - .await; + .await + .ok_or_else(|| UploadError::Cancelled) + .and_then(|x| x); match &res { Ok(_) => {} diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index ee331ea154..86d0390c30 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -88,13 +88,16 @@ use crate::task_mgr::TaskKind; +pub(crate) mod optional_counter; + // The main structure of this module, see module-level comment. 
-#[derive(Clone, Debug)] +#[derive(Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, + pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32, } /// The kind of access to the page cache. @@ -150,6 +153,7 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, + micros_spent_throttled: Default::default(), }, } } @@ -163,6 +167,7 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, + micros_spent_throttled: Default::default(), }, } } diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs new file mode 100644 index 0000000000..100c649f18 --- /dev/null +++ b/pageserver/src/context/optional_counter.rs @@ -0,0 +1,101 @@ +use std::{ + sync::atomic::{AtomicU32, Ordering}, + time::Duration, +}; + +#[derive(Debug)] +pub struct CounterU32 { + inner: AtomicU32, +} +impl Default for CounterU32 { + fn default() -> Self { + Self { + inner: AtomicU32::new(u32::MAX), + } + } +} +impl CounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + match self + .inner + .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) + { + Ok(_) => Ok(()), + Err(_) => Err("open() called on clsoed state"), + } + } + pub fn close(&self) -> Result { + match self.inner.swap(u32::MAX, Ordering::Relaxed) { + u32::MAX => Err("close() called on closed state"), + x => Ok(x), + } + } + + pub fn add(&self, count: u32) -> Result<(), &'static str> { + if count == 0 { + return Ok(()); + } + let mut had_err = None; + self.inner + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { + u32::MAX => { + had_err = Some("add() called on closed state"); + 
None + } + x => { + let (new, overflowed) = x.overflowing_add(count); + if new == u32::MAX || overflowed { + had_err = Some("add() overflowed the counter"); + None + } else { + Some(new) + } + } + }) + .map_err(|_| had_err.expect("we set it whenever the function returns None")) + .map(|_| ()) + } +} + +#[derive(Default, Debug)] +pub struct MicroSecondsCounterU32 { + inner: CounterU32, +} + +impl MicroSecondsCounterU32 { + pub fn open(&self) -> Result<(), &'static str> { + self.inner.open() + } + pub fn add(&self, duration: Duration) -> Result<(), &'static str> { + match duration.as_micros().try_into() { + Ok(x) => self.inner.add(x), + Err(_) => Err("add(): duration conversion error"), + } + } + pub fn close_and_checked_sub_from(&self, from: Duration) -> Result { + let val = self.inner.close()?; + let val = Duration::from_micros(val as u64); + let subbed = match from.checked_sub(val) { + Some(v) => v, + None => return Err("Duration::checked_sub"), + }; + Ok(subbed) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_basic() { + let counter = MicroSecondsCounterU32::default(); + counter.open().unwrap(); + counter.add(Duration::from_micros(23)).unwrap(); + let res = counter + .close_and_checked_sub_from(Duration::from_micros(42)) + .unwrap(); + assert_eq!(res, Duration::from_micros(42 - 23)); + } +} diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 25ae3d1b01..26e7cc7ef8 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -1,17 +1,21 @@ use std::collections::HashMap; +use futures::Future; use pageserver_api::{ - control_api::{ - ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse, - }, + controller_api::NodeRegisterRequest, shard::TenantShardId, + upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateRequestTenant, ValidateResponse, + }, }; use 
serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, generation::Generation, id::NodeId}; +use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; -use crate::config::PageServerConf; +use crate::{config::PageServerConf, virtual_file::on_fatal_io_error}; +use pageserver_api::config::NodeMetadata; /// The Pageserver's client for using the control plane API: this is a small subset /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md) @@ -28,13 +32,17 @@ pub enum RetryForeverError { ShuttingDown, } -#[async_trait::async_trait] pub trait ControlPlaneGenerationsApi { - async fn re_attach(&self) -> Result, RetryForeverError>; - async fn validate( + fn re_attach( + &self, + conf: &PageServerConf, + ) -> impl Future< + Output = Result, RetryForeverError>, + > + Send; + fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, - ) -> Result, RetryForeverError>; + ) -> impl Future, RetryForeverError>> + Send; } impl ControlPlaneClient { @@ -55,7 +63,7 @@ impl ControlPlaneClient { let mut client = reqwest::ClientBuilder::new(); if let Some(jwt) = &conf.control_plane_api_token { - let mut headers = hyper::HeaderMap::new(); + let mut headers = reqwest::header::HeaderMap::new(); headers.insert( "Authorization", format!("Bearer {}", jwt.get_contents()).parse().unwrap(), @@ -80,59 +88,87 @@ impl ControlPlaneClient { R: Serialize, T: DeserializeOwned, { - #[derive(thiserror::Error, Debug)] - enum RemoteAttemptError { - #[error("shutdown")] - Shutdown, - #[error("remote: {0}")] - Remote(reqwest::Error), - } - - match backoff::retry( + let res = backoff::retry( || async { let response = self .http_client .post(url.clone()) .json(&request) .send() - .await - .map_err(RemoteAttemptError::Remote)?; + .await?; - response - .error_for_status_ref() - .map_err(RemoteAttemptError::Remote)?; - response - .json::() - .await - 
.map_err(RemoteAttemptError::Remote) + response.error_for_status_ref()?; + response.json::().await }, |_| false, 3, u32::MAX, "calling control plane generation validation API", - backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown), + &self.cancel, ) .await - { - Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown), - Err(RemoteAttemptError::Remote(_)) => { - panic!("We retry forever, this should never be reached"); - } - Ok(r) => Ok(r), - } + .ok_or(RetryForeverError::ShuttingDown)? + .expect("We retry forever, this should never be reached"); + + Ok(res) } } -#[async_trait::async_trait] impl ControlPlaneGenerationsApi for ControlPlaneClient { /// Block until we get a successful response, or error out if we are shut down - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach( + &self, + conf: &PageServerConf, + ) -> Result, RetryForeverError> { let re_attach_path = self .base_url .join("re-attach") .expect("Failed to build re-attach path"); + + // Include registration content in the re-attach request if a metadata file is readable + let metadata_path = conf.metadata_path(); + let register = match tokio::fs::read_to_string(&metadata_path).await { + Ok(metadata_str) => match serde_json::from_str::(&metadata_str) { + Ok(m) => { + // Since we run one time at startup, be generous in our logging and + // dump all metadata. 
+ tracing::info!( + "Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}", + m.postgres_host, + m.postgres_port, + m.http_host, + m.http_port, + m.other + ); + + Some(NodeRegisterRequest { + node_id: conf.id, + listen_pg_addr: m.postgres_host, + listen_pg_port: m.postgres_port, + listen_http_addr: m.http_host, + listen_http_port: m.http_port, + }) + } + Err(e) => { + tracing::error!("Unreadable metadata in {metadata_path}: {e}"); + None + } + }, + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + // This is legal: we may have been deployed with some external script + // doing registration for us. + tracing::info!("Metadata file not found at {metadata_path}"); + } else { + on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}")) + } + None + } + }; + let request = ReAttachRequest { node_id: self.node_id, + register, }; fail::fail_point!("control-plane-client-re-attach"); @@ -146,7 +182,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { Ok(response .tenants .into_iter() - .map(|t| (t.id, Generation::new(t.gen))) + .map(|rart| (rart.id, rart)) .collect::>()) } @@ -172,7 +208,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .collect(), }; - fail::fail_point!("control-plane-client-validate"); + failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); + if self.cancel.is_cancelled() { + return Err(RetryForeverError::ShuttingDown); + } let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 7b05745483..3960fc1b99 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -20,10 +20,9 @@ use remote_storage::{GenericRemoteStorage, RemotePath}; use serde::Deserialize; use serde::Serialize; use thiserror::Error; -use tokio; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use tracing::{self, debug, error}; 
+use tracing::{debug, error}; use utils::crashsafe::path_with_suffix_extension; use utils::generation::Generation; use utils::id::TimelineId; @@ -39,7 +38,7 @@ use deleter::DeleterMessage; use list_writer::ListWriterQueueMessage; use validator::ValidatorQueueMessage; -use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName}; +use crate::{config::PageServerConf, tenant::storage_layer::LayerName}; // TODO: configurable for how long to wait before executing deletions @@ -234,7 +233,7 @@ impl DeletionHeader { let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?; let header_path = conf.deletion_header_path(); let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); - VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) + VirtualFile::crashsafe_overwrite(header_path, temp_path, header_bytes) .await .maybe_fatal_err("save deletion header")?; @@ -312,7 +311,7 @@ impl DeletionList { result.extend( timeline_layers .into_iter() - .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))), + .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))), ); } } @@ -325,7 +324,8 @@ impl DeletionList { let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX); let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); - VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) + + VirtualFile::crashsafe_overwrite(path, temp_path, bytes) .await .maybe_fatal_err("save deletion list") .map_err(Into::into) @@ -479,7 +479,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { if current_generation.is_none() { debug!("Enqueuing deletions in legacy mode, skipping queue"); @@ -511,7 +511,7 @@ impl DeletionQueueClient { tenant_shard_id: TenantShardId, timeline_id: TimelineId, 
current_generation: Generation, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> Result<(), DeletionQueueError> { metrics::DELETION_QUEUE .keys_submitted @@ -632,7 +632,7 @@ impl DeletionQueue { /// /// If remote_storage is None, then the returned workers will also be None. pub fn new( - remote_storage: Option, + remote_storage: GenericRemoteStorage, control_plane_client: Option, conf: &'static PageServerConf, ) -> (Self, Option>) @@ -658,23 +658,6 @@ impl DeletionQueue { // longer to flush after Tenants have all been torn down. let cancel = CancellationToken::new(); - let remote_storage = match remote_storage { - None => { - return ( - Self { - client: DeletionQueueClient { - tx, - executor_tx, - lsn_table: lsn_table.clone(), - }, - cancel, - }, - None, - ) - } - Some(r) => r, - }; - ( Self { client: DeletionQueueClient { @@ -700,8 +683,6 @@ impl DeletionQueue { } pub async fn shutdown(&mut self, timeout: Duration) { - self.cancel.cancel(); - match tokio::time::timeout(timeout, self.client.flush()).await { Ok(Ok(())) => { tracing::info!("Deletion queue flushed successfully on shutdown") @@ -715,6 +696,10 @@ impl DeletionQueue { tracing::warn!("Timed out flushing deletion queue on shutdown") } } + + // We only cancel _after_ flushing: otherwise we would be shutting down the + // components that do the flush. 
+ self.cancel.cancel(); } } @@ -722,7 +707,7 @@ impl DeletionQueue { mod test { use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::shard::ShardIndex; + use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant}; use std::{io::ErrorKind, time::Duration}; use tracing::info; @@ -732,23 +717,20 @@ mod test { use crate::{ control_plane_client::RetryForeverError, repository::Key, - tenant::{ - harness::TenantHarness, remote_timeline_client::remote_timeline_path, - storage_layer::DeltaFileName, - }, + tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; use super::*; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); - pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51), }); // When you need a second layer in a test. 
- pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName { + pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName { key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF), lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61), }); @@ -766,7 +748,7 @@ mod test { /// Simulate a pageserver restart by destroying and recreating the deletion queue async fn restart(&mut self) { let (deletion_queue, workers) = DeletionQueue::new( - Some(self.storage.clone()), + self.storage.clone(), Some(self.mock_control_plane.clone()), self.harness.conf, ); @@ -798,7 +780,7 @@ mod test { /// Returns remote layer file name, suitable for use in assert_remote_files fn write_remote_layer( &self, - file_name: LayerFileName, + file_name: LayerName, gen: Generation, ) -> anyhow::Result { let tenant_shard_id = self.harness.tenant_shard_id; @@ -831,12 +813,14 @@ mod test { } } - #[async_trait::async_trait] impl ControlPlaneGenerationsApi for MockControlPlane { - #[allow(clippy::diverging_sub_expression)] // False positive via async_trait - async fn re_attach(&self) -> Result, RetryForeverError> { + async fn re_attach( + &self, + _conf: &PageServerConf, + ) -> Result, RetryForeverError> { unimplemented!() } + async fn validate( &self, tenants: Vec<(TenantShardId, Generation)>, @@ -867,13 +851,14 @@ mod test { let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?; let storage_config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; let storage = GenericRemoteStorage::from_config(&storage_config).unwrap(); let mock_control_plane = MockControlPlane::new(); let (deletion_queue, worker) = DeletionQueue::new( - Some(storage.clone()), + storage.clone(), Some(mock_control_plane.clone()), harness.conf, ); @@ -950,7 +935,7 @@ mod test { let client = ctx.deletion_queue.new_client(); client.recover(HashMap::new())?; - let 
layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); let tenant_shard_id = ctx.harness.tenant_shard_id; let content: Vec = "victim1 contents".into(); @@ -1159,17 +1144,13 @@ mod test { pub(crate) mod mock { use tracing::info; - use crate::tenant::remote_timeline_client::remote_layer_path; - use super::*; - use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }; + use std::sync::atomic::{AtomicUsize, Ordering}; pub struct ConsumerState { rx: tokio::sync::mpsc::UnboundedReceiver, executor_rx: tokio::sync::mpsc::Receiver, + cancel: CancellationToken, } impl ConsumerState { @@ -1183,7 +1164,7 @@ pub(crate) mod mock { match msg { DeleterMessage::Delete(objects) => { for path in objects { - match remote_storage.delete(&path).await { + match remote_storage.delete(&path, &self.cancel).await { Ok(_) => { debug!("Deleted {path}"); } @@ -1216,7 +1197,7 @@ pub(crate) mod mock { for path in objects { info!("Executing deletion {path}"); - match remote_storage.delete(&path).await { + match remote_storage.delete(&path, &self.cancel).await { Ok(_) => { debug!("Deleted {path}"); } @@ -1266,7 +1247,11 @@ pub(crate) mod mock { executor_tx, executed, remote_storage, - consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }), + consumer: std::sync::Mutex::new(ConsumerState { + rx, + executor_rx, + cancel: CancellationToken::new(), + }), lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())), } } diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 57421b1547..1f04bc0410 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -8,6 +8,7 @@ use remote_storage::GenericRemoteStorage; use 
remote_storage::RemotePath; +use remote_storage::TimeoutOrCancel; use remote_storage::MAX_KEYS_PER_DELETE; use std::time::Duration; use tokio_util::sync::CancellationToken; @@ -71,15 +72,19 @@ impl Deleter { Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute")) }); - self.remote_storage.delete_objects(&self.accumulator).await + self.remote_storage + .delete_objects(&self.accumulator, &self.cancel) + .await }, - |_| false, + TimeoutOrCancel::caused_by_cancel, 3, 10, "executing deletion batch", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")), + &self.cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Shutting down")) + .and_then(|x| x) } /// Block until everything in accumulator has been executed diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 3a3d600ac2..ae3b2c9180 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::LayerFileMetadata; -use crate::tenant::storage_layer::LayerFileName; +use crate::tenant::storage_layer::LayerName; use crate::virtual_file::on_fatal_io_error; use crate::virtual_file::MaybeFatalIo; @@ -59,7 +59,7 @@ pub(super) struct DeletionOp { // `layers` and `objects` are both just lists of objects. `layers` is used if you do not // have a config object handy to project it to a remote key, and need the consuming worker // to do it for you. 
- pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(super) layers: Vec<(LayerName, LayerFileMetadata)>, pub(super) objects: Vec, /// The _current_ generation of the Tenant shard attachment in which we are enqueuing diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 23b9b573b6..90bd4294bb 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -47,22 +47,24 @@ use std::{ }; use anyhow::Context; -use camino::Utf8Path; +use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use serde::{Deserialize, Serialize}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::completion; use utils::serde_percent::Percent; +use utils::{completion, id::TimelineId}; use crate::{ config::PageServerConf, + metrics::disk_usage_based_eviction::METRICS, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - self, - storage_layer::{AsLayerDesc, EvictionError, Layer}, - Timeline, + mgr::TenantManager, + remote_timeline_client::LayerFileMetadata, + secondary::SecondaryTenant, + storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName}, }, }; @@ -94,23 +96,86 @@ pub enum EvictionOrder { /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. - /// - /// This strategy will evict layers more fairly but is untested. RelativeAccessed { - #[serde(default)] + /// Determines if the tenant with most layers should lose first. + /// + /// Having this enabled is currently the only reasonable option, because the order in which + /// we read tenants is deterministic. If we find the need to use this as `false`, we need + /// to ensure nondeterminism by adding in a random number to break the + /// `relative_last_activity==0.0` ties. 
+ #[serde(default = "default_highest_layer_count_loses_first")] highest_layer_count_loses_first: bool, }, } +fn default_highest_layer_count_loses_first() -> bool { + true +} + impl EvictionOrder { - /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer - /// counts should be the first ones to have their layers evicted. - fn highest_layer_count_loses_first(&self) -> bool { + fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) { + use EvictionOrder::*; + match self { - EvictionOrder::AbsoluteAccessed => false, - EvictionOrder::RelativeAccessed { + AbsoluteAccessed => { + candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.last_activity_ts) + }); + } + RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| { + (*partition, candidate.relative_last_activity) + }), + } + } + + /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants + /// layers in **most** recently used order. + fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 { + use EvictionOrder::*; + + match self { + AbsoluteAccessed => finite_f32::FiniteF32::ZERO, + RelativeAccessed { highest_layer_count_loses_first, - } => *highest_layer_count_loses_first, + } => { + // keeping the -1 or not decides if every tenant should lose their least recently accessed + // layer OR if this should happen in the order of having highest layer count: + let fudge = if *highest_layer_count_loses_first { + // relative_last_activity vs. tenant layer count: + // - 0.1..=1.0 (10 layers) + // - 0.01..=1.0 (100 layers) + // - 0.001..=1.0 (1000 layers) + // + // leading to evicting less of the smallest tenants. + 0 + } else { + // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a + // layer. 
the actual ordering is unspecified: for 10k tenants on a pageserver it could + // be that less than 10k layer evictions is enough, so we would not need to evict from + // all tenants. + // + // as the tenant ordering is now deterministic this could hit the same tenants + // disproportionetly on multiple invocations. alternative could be to remember how many + // layers did we evict last time from this tenant, and inject that as an additional + // fudge here. + 1 + }; + + let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1); + let divider = total as f32; + + // most recently used is always (total - 0) / divider == 1.0 + // least recently used depends on the fudge: + // - (total - 1) - (total - 1) / total => 0 / total + // - total - (total - 1) / total => 1 / total + let distance = (total - index) as f32; + + finite_f32::FiniteF32::try_from_normalized(distance / divider) + .unwrap_or_else(|val| { + tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}"); + finite_f32::FiniteF32::ZERO + }) + } } } } @@ -125,6 +190,7 @@ pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, state: Arc, + tenant_manager: Arc, background_jobs_barrier: completion::Barrier, ) -> anyhow::Result<()> { let Some(task_config) = &conf.disk_usage_based_eviction else { @@ -150,8 +216,7 @@ pub fn launch_disk_usage_global_eviction_task( _ = background_jobs_barrier.wait() => { } }; - disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel) - .await; + disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await; Ok(()) }, ); @@ -164,7 +229,7 @@ async fn disk_usage_eviction_task( state: &State, task_config: &DiskUsageEvictionTaskConfig, storage: &GenericRemoteStorage, - tenants_dir: &Utf8Path, + tenant_manager: Arc, cancel: CancellationToken, ) { scopeguard::defer! 
{ @@ -191,7 +256,7 @@ async fn disk_usage_eviction_task( state, task_config, storage, - tenants_dir, + &tenant_manager, &cancel, ) .await; @@ -226,15 +291,17 @@ async fn disk_usage_eviction_task_iteration( state: &State, task_config: &DiskUsageEvictionTaskConfig, storage: &GenericRemoteStorage, - tenants_dir: &Utf8Path, + tenant_manager: &Arc, cancel: &CancellationToken, ) -> anyhow::Result<()> { - let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) + let tenants_dir = tenant_manager.get_conf().tenants_path(); + let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config) .context("get filesystem-level disk usage before evictions")?; let res = disk_usage_eviction_task_iteration_impl( state, storage, usage_pre, + tenant_manager, task_config.eviction_order, cancel, ) @@ -248,7 +315,7 @@ async fn disk_usage_eviction_task_iteration( } IterationOutcome::Finished(outcome) => { // Verify with statvfs whether we made any real progress - let after = filesystem_level_usage::get(tenants_dir, task_config) + let after = filesystem_level_usage::get(&tenants_dir, task_config) // It's quite unlikely to hit the error here. Keep the code simple and bail out. .context("get filesystem-level disk usage after evictions")?; @@ -283,7 +350,6 @@ pub enum IterationOutcome { Finished(IterationOutcomeFinished), } -#[allow(dead_code)] #[derive(Debug, Serialize)] pub struct IterationOutcomeFinished { /// The actual usage observed before we started the iteration. @@ -298,7 +364,6 @@ pub struct IterationOutcomeFinished { } #[derive(Debug, Serialize)] -#[allow(dead_code)] struct AssumedUsage { /// The expected value for `after`, after phase 2. 
projected_after: U, @@ -306,14 +371,12 @@ struct AssumedUsage { failed: LayerCount, } -#[allow(dead_code)] #[derive(Debug, Serialize)] struct PlannedUsage { respecting_tenant_min_resident_size: U, fallback_to_global_lru: Option, } -#[allow(dead_code)] #[derive(Debug, Default, Serialize)] struct LayerCount { file_sizes: u64, @@ -324,6 +387,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( state: &State, _storage: &GenericRemoteStorage, usage_pre: U, + tenant_manager: &Arc, eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result> { @@ -344,29 +408,39 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let candidates = match collect_eviction_candidates(eviction_order, cancel).await? { - EvictionCandidates::Cancelled => { - return Ok(IterationOutcome::Cancelled); + let (candidates, collection_time) = { + let started_at = std::time::Instant::now(); + match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? 
{ + EvictionCandidates::Cancelled => { + return Ok(IterationOutcome::Cancelled); + } + EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()), } - EvictionCandidates::Finished(partitioned) => partitioned, }; + METRICS.layers_collected.inc_by(candidates.len() as u64); + + tracing::info!( + elapsed_ms = collection_time.as_millis(), + total_layers = candidates.len(), + "collection completed" + ); + // Debug-log the list of candidates let now = SystemTime::now(); for (i, (partition, candidate)) in candidates.iter().enumerate() { let nth = i + 1; - let desc = candidate.layer.layer_desc(); let total_candidates = candidates.len(); - let size = desc.file_size; + let size = candidate.layer.get_file_size(); let rel = candidate.relative_last_activity; debug!( "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}", now.duration_since(candidate.last_activity_ts) .unwrap() .as_micros(), - desc.tenant_shard_id, - desc.timeline_id, - candidate.layer, + candidate.layer.get_tenant_shard_id(), + candidate.layer.get_timeline_id(), + candidate.layer.get_name(), ); } @@ -380,39 +454,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // If we get far enough in the list that we start to evict layers that are below // the tenant's min-resident-size threshold, print a warning, and memorize the disk // usage at that point, in 'usage_planned_min_resident_size_respecting'. 
- let mut warned = None; - let mut usage_planned = usage_pre; - let mut evicted_amount = 0; - for (i, (partition, candidate)) in candidates.iter().enumerate() { - if !usage_planned.has_pressure() { - debug!( - no_candidates_evicted = i, - "took enough candidates for pressure to be relieved" - ); - break; - } + let (evicted_amount, usage_planned) = + select_victims(&candidates, usage_pre).into_amount_and_planned(); - if partition == &MinResidentSizePartition::Below && warned.is_none() { - warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"); - warned = Some(usage_planned); - } - - usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size); - evicted_amount += 1; - } - - let usage_planned = match warned { - Some(respecting_tenant_min_resident_size) => PlannedUsage { - respecting_tenant_min_resident_size, - fallback_to_global_lru: Some(usage_planned), - }, - None => PlannedUsage { - respecting_tenant_min_resident_size: usage_planned, - fallback_to_global_lru: None, - }, - }; - debug!(?usage_planned, "usage planned"); + METRICS.layers_selected.inc_by(evicted_amount as u64); // phase2: evict layers @@ -441,9 +487,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( if let Some(next) = next { match next { Ok(Ok(file_size)) => { + METRICS.layers_evicted.inc(); usage_assumed.add_available_bytes(file_size); } - Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => { + Ok(Err(( + file_size, + EvictionError::NotFound + | EvictionError::Downloaded + | EvictionError::Timeout, + ))) => { evictions_failed.file_sizes += file_size; evictions_failed.count += 1; } @@ -459,29 +511,70 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( // calling again when consumed_all is fine as evicted is fused. 
let Some((_partition, candidate)) = evicted.next() else { - consumed_all = true; + if !consumed_all { + tracing::info!("all evictions started, waiting"); + consumed_all = true; + } continue; }; - js.spawn(async move { - let rtc = candidate.timeline.remote_client.as_ref().expect( - "holding the witness, all timelines must have a remote timeline client", - ); - let file_size = candidate.layer.layer_desc().file_size; - candidate - .layer - .evict_and_wait(rtc) - .await - .map(|()| file_size) - .map_err(|e| (file_size, e)) - }); + match candidate.layer { + EvictionLayer::Attached(layer) => { + let file_size = layer.layer_desc().file_size; + js.spawn(async move { + // have a low eviction waiting timeout because our LRU calculations go stale fast; + // also individual layer evictions could hang because of bugs and we do not want to + // pause disk_usage_based_eviction for such. + let timeout = std::time::Duration::from_secs(5); + match layer.evict_and_wait(timeout).await { + Ok(()) => Ok(file_size), + Err(e) => Err((file_size, e)), + } + }); + } + EvictionLayer::Secondary(layer) => { + let file_size = layer.metadata.file_size; + + js.spawn(async move { + layer + .secondary_tenant + .evict_layer(layer.timeline_id, layer.name) + .await; + Ok(file_size) + }); + } + } tokio::task::yield_now().await; } (usage_assumed, evictions_failed) }; + let started_at = std::time::Instant::now(); + + let evict_layers = async move { + let mut evict_layers = std::pin::pin!(evict_layers); + + let maximum_expected = std::time::Duration::from_secs(10); + + let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await; + let tuple = if let Ok(tuple) = res { + tuple + } else { + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing"); + evict_layers.await + }; + + let elapsed = started_at.elapsed(); + tracing::info!(elapsed_ms = elapsed.as_millis(), "completed"); + tuple + }; + + let evict_layers = + 
evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount)); + let (usage_assumed, evictions_failed) = tokio::select! { tuple = evict_layers => { tuple }, _ = cancel.cancelled() => { @@ -502,11 +595,100 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( } #[derive(Clone)] -struct EvictionCandidate { - timeline: Arc, - layer: Layer, - last_activity_ts: SystemTime, - relative_last_activity: finite_f32::FiniteF32, +pub(crate) struct EvictionSecondaryLayer { + pub(crate) secondary_tenant: Arc, + pub(crate) timeline_id: TimelineId, + pub(crate) name: LayerName, + pub(crate) metadata: LayerFileMetadata, +} + +/// Full [`Layer`] objects are specific to tenants in attached mode. This type is a layer +/// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name. +#[derive(Clone)] +pub(crate) enum EvictionLayer { + Attached(Layer), + Secondary(EvictionSecondaryLayer), +} + +impl From for EvictionLayer { + fn from(value: Layer) -> Self { + Self::Attached(value) + } +} + +impl EvictionLayer { + pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId { + match self { + Self::Attached(l) => &l.layer_desc().tenant_shard_id, + Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(), + } + } + + pub(crate) fn get_timeline_id(&self) -> &TimelineId { + match self { + Self::Attached(l) => &l.layer_desc().timeline_id, + Self::Secondary(sl) => &sl.timeline_id, + } + } + + pub(crate) fn get_name(&self) -> LayerName { + match self { + Self::Attached(l) => l.layer_desc().layer_name(), + Self::Secondary(sl) => sl.name.clone(), + } + } + + pub(crate) fn get_file_size(&self) -> u64 { + match self { + Self::Attached(l) => l.layer_desc().file_size, + Self::Secondary(sl) => sl.metadata.file_size, + } + } +} + +#[derive(Clone)] +pub(crate) struct EvictionCandidate { + pub(crate) layer: EvictionLayer, + pub(crate) last_activity_ts: SystemTime, + pub(crate) relative_last_activity: finite_f32::FiniteF32, +} + 
+impl std::fmt::Display for EvictionLayer { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Self::Attached(l) => l.fmt(f), + Self::Secondary(sl) => { + write!(f, "{}/{}", sl.timeline_id, sl.name) + } + } + } +} + +#[derive(Default)] +pub(crate) struct DiskUsageEvictionInfo { + /// Timeline's largest layer (remote or resident) + pub max_layer_size: Option, + /// Timeline's resident layers + pub resident_layers: Vec, +} + +impl std::fmt::Debug for EvictionCandidate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it + // having to allocate a string to this is bad, but it will rarely be formatted + let ts = chrono::DateTime::::from(self.last_activity_ts); + let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); + struct DisplayIsDebug<'a, T>(&'a T); + impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } + } + f.debug_struct("LocalLayerInfoForDiskUsageEviction") + .field("layer", &DisplayIsDebug(&self.layer)) + .field("last_activity", &ts) + .finish() + } } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] @@ -623,22 +805,32 @@ enum EvictionCandidates { /// - tenant B 1 layer /// - tenant C 8 layers async fn collect_eviction_candidates( + tenant_manager: &Arc, eviction_order: EvictionOrder, cancel: &CancellationToken, ) -> anyhow::Result { + const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10); + // get a snapshot of the list of tenants - let tenants = tenant::mgr::list_tenants() - .await + let tenants = tenant_manager + .list_tenants() .context("get list of tenants")?; + // TODO: avoid listing every layer in every tenant: this loop can block the executor, + // and the resulting data structure can be huge. 
+ // (https://github.com/neondatabase/neon/issues/6224) let mut candidates = Vec::new(); - for (tenant_id, _state) in &tenants { + for (tenant_id, _state, _gen) in tenants { if cancel.is_cancelled() { return Ok(EvictionCandidates::Cancelled); } - let tenant = match tenant::mgr::get_tenant(*tenant_id, true) { - Ok(tenant) => tenant, + let tenant = match tenant_manager.get_attached_tenant_shard(tenant_id) { + Ok(tenant) if tenant.is_active() => tenant, + Ok(_) => { + debug!(tenant_id=%tenant_id.tenant_id, shard_id=%tenant_id.shard_slug(), "Tenant shard is not active"); + continue; + } Err(e) => { // this can happen if tenant has lifecycle transition after we fetched it debug!("failed to get tenant: {e:#}"); @@ -651,6 +843,8 @@ async fn collect_eviction_candidates( continue; } + let started_at = std::time::Instant::now(); + // collect layers from all timelines in this tenant // // If one of the timelines becomes `!is_active()` during the iteration, @@ -665,11 +859,8 @@ async fn collect_eviction_candidates( } let info = tl.get_local_layers_for_disk_usage_eviction().await; debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); - tenant_candidates.extend( - info.resident_layers - .into_iter() - .map(|layer_infos| (tl.clone(), layer_infos)), - ); + + tenant_candidates.extend(info.resident_layers.into_iter()); max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); if cancel.is_cancelled() { @@ -690,14 +881,16 @@ async fn collect_eviction_candidates( // A default override can be put in the default tenant conf in the pageserver.toml. 
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { debug!( - tenant_id=%tenant.tenant_id(), + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), overridden_size=s, "using overridden min resident size for tenant" ); s } else { debug!( - tenant_id=%tenant.tenant_id(), + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), max_layer_size, "using max layer size as min_resident_size for tenant", ); @@ -707,121 +900,198 @@ async fn collect_eviction_candidates( // Sort layers most-recently-used first, then partition by // cumsum above/below min_resident_size. tenant_candidates - .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); + .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); let mut cumsum: i128 = 0; - // keeping the -1 or not decides if every tenant should lose their least recently accessed - // layer OR if this should happen in the order of having highest layer count: - let fudge = if eviction_order.highest_layer_count_loses_first() { - // relative_age vs. tenant layer count: - // - 0.1..=1.0 (10 layers) - // - 0.01..=1.0 (100 layers) - // - 0.001..=1.0 (1000 layers) - // - // leading to evicting less of the smallest tenants. - 0 - } else { - // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a - // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could - // be that less than 10k layer evictions is enough, so we would not need to evict from - // all tenants. - // - // as the tenant ordering is now deterministic this could hit the same tenants - // disproportionetly on multiple invocations. alternative could be to remember how many - // layers did we evict last time from this tenant, and inject that as an additional - // fudge here. 
- 1 - }; + let total = tenant_candidates.len(); - let total = tenant_candidates - .len() - .checked_sub(fudge) - .filter(|&x| x > 0) - // support 0 or 1 resident layer tenants as well - .unwrap_or(1); - let divider = total as f32; + let tenant_candidates = + tenant_candidates + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. + candidate.relative_last_activity = + eviction_order.relative_last_activity(total, i); - for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() { - let file_size = layer_info.file_size(); + let partition = if cumsum > min_resident_size as i128 { + MinResidentSizePartition::Above + } else { + MinResidentSizePartition::Below + }; + cumsum += i128::from(candidate.layer.get_file_size()); - // as we iterate this reverse sorted list, the most recently accessed layer will always - // be 1.0; this is for us to evict it last. - let relative_last_activity = if matches!( - eviction_order, - EvictionOrder::RelativeAccessed { .. } - ) { - // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or - // similarly for u16. unsure how it would help. 
- finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider) - .unwrap_or_else(|val| { - tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}"); - finite_f32::FiniteF32::ZERO - }) - } else { - finite_f32::FiniteF32::ZERO - }; + (partition, candidate) + }); - let candidate = EvictionCandidate { - timeline, - last_activity_ts: layer_info.last_activity_ts, - layer: layer_info.layer, - relative_last_activity, - }; - let partition = if cumsum > min_resident_size as i128 { - MinResidentSizePartition::Above - } else { - MinResidentSizePartition::Below - }; - candidates.push((partition, candidate)); - cumsum += i128::from(file_size); + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); + + candidates.extend(tenant_candidates); + + let elapsed = started_at.elapsed(); + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); + } + } + + // Note: the same tenant ID might be hit twice, if it transitions from attached to + // secondary while we run. That is okay: when we eventually try and run the eviction, + // the `Gate` on the object will ensure that whichever one has already been shut down + // will not delete anything. + + let mut secondary_tenants = Vec::new(); + tenant_manager.foreach_secondary_tenants( + |_tenant_shard_id: &TenantShardId, state: &Arc| { + secondary_tenants.push(state.clone()); + }, + ); + + for tenant in secondary_tenants { + // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is + // to prevent repeated disk usage based evictions from completely draining less often + // updating secondaries. 
+ let (mut layer_info, total_layers) = tenant.get_layers_for_eviction(); + + debug_assert!( + total_layers >= layer_info.resident_layers.len(), + "total_layers ({total_layers}) must be at least the resident_layers.len() ({})", + layer_info.resident_layers.len() + ); + + let started_at = std::time::Instant::now(); + + layer_info + .resident_layers + .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); + + let tenant_candidates = + layer_info + .resident_layers + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + candidate.relative_last_activity = + eviction_order.relative_last_activity(total_layers, i); + ( + // Secondary locations' layers are always considered above the min resident size, + // i.e. secondary locations are permitted to be trimmed to zero layers if all + // the layers have sufficiently old access times. + MinResidentSizePartition::Above, + candidate, + ) + }); + + METRICS + .tenant_layer_count + .observe(tenant_candidates.len() as f64); + candidates.extend(tenant_candidates); + + tokio::task::yield_now().await; + + let elapsed = started_at.elapsed(); + + METRICS + .tenant_collection_time + .observe(elapsed.as_secs_f64()); + + if elapsed > LOG_DURATION_THRESHOLD { + tracing::info!( + tenant_id=%tenant.tenant_shard_id().tenant_id, + shard_id=%tenant.tenant_shard_id().shard_slug(), + elapsed_ms = elapsed.as_millis(), + "collection took longer than threshold" + ); } } debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); - match eviction_order { - EvictionOrder::AbsoluteAccessed => { - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.last_activity_ts) - }); - } - EvictionOrder::RelativeAccessed { .. 
} => { - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.relative_last_activity) - }); - } - } + eviction_order.sort(&mut candidates); Ok(EvictionCandidates::Finished(candidates)) } -struct TimelineKey(Arc); +/// Given a pre-sorted vec of all layers in the system, select the first N which are enough to +/// relieve pressure. +/// +/// Returns the amount of candidates selected, with the planned usage. +fn select_victims( + candidates: &[(MinResidentSizePartition, EvictionCandidate)], + usage_pre: U, +) -> VictimSelection { + let mut usage_when_switched = None; + let mut usage_planned = usage_pre; + let mut evicted_amount = 0; -impl PartialEq for TimelineKey { - fn eq(&self, other: &Self) -> bool { - Arc::ptr_eq(&self.0, &other.0) + for (i, (partition, candidate)) in candidates.iter().enumerate() { + if !usage_planned.has_pressure() { + break; + } + + if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() { + usage_when_switched = Some((usage_planned, i)); + } + + usage_planned.add_available_bytes(candidate.layer.get_file_size()); + evicted_amount += 1; + } + + VictimSelection { + amount: evicted_amount, + usage_pre, + usage_when_switched, + usage_planned, } } -impl Eq for TimelineKey {} - -impl std::hash::Hash for TimelineKey { - fn hash(&self, state: &mut H) { - Arc::as_ptr(&self.0).hash(state); - } +struct VictimSelection { + amount: usize, + usage_pre: U, + usage_when_switched: Option<(U, usize)>, + usage_planned: U, } -impl std::ops::Deref for TimelineKey { - type Target = Timeline; +impl VictimSelection { + fn into_amount_and_planned(self) -> (usize, PlannedUsage) { + debug!( + evicted_amount=%self.amount, + "took enough candidates for pressure to be relieved" + ); - fn deref(&self) -> &Self::Target { - self.0.as_ref() + if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() { + warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, 
"tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"); + } + + let planned = match self.usage_when_switched { + Some((respecting_tenant_min_resident_size, _)) => PlannedUsage { + respecting_tenant_min_resident_size, + fallback_to_global_lru: Some(self.usage_planned), + }, + None => PlannedUsage { + respecting_tenant_min_resident_size: self.usage_planned, + fallback_to_global_lru: None, + }, + }; + + (self.amount, planned) } } /// A totally ordered f32 subset we can use with sorting functions. -mod finite_f32 { +pub(crate) mod finite_f32 { /// A totally ordered f32 subset we can use with sorting functions. #[derive(Clone, Copy, PartialEq)] @@ -865,6 +1135,12 @@ mod finite_f32 { } } + impl From for f32 { + fn from(value: FiniteF32) -> f32 { + value.0 + } + } + impl FiniteF32 { pub const ZERO: FiniteF32 = FiniteF32(0.0); @@ -877,6 +1153,10 @@ mod finite_f32 { Err(value) } } + + pub fn into_inner(self) -> f32 { + self.into() + } } } @@ -889,7 +1169,6 @@ mod filesystem_level_usage { use super::DiskUsageEvictionTaskConfig; #[derive(Debug, Clone, Copy)] - #[allow(dead_code)] pub struct Usage<'a> { config: &'a DiskUsageEvictionTaskConfig, @@ -1000,3 +1279,40 @@ mod filesystem_level_usage { assert!(!usage.has_pressure()); } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn relative_equal_bounds() { + let order = EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: false, + }; + + let len = 10; + let v = (0..len) + .map(|i| order.relative_last_activity(len, i).into_inner()) + .collect::>(); + + assert_eq!(v.first(), Some(&1.0)); + assert_eq!(v.last(), Some(&0.0)); + assert!(v.windows(2).all(|slice| slice[0] > slice[1])); + } + + #[test] + fn relative_spare_bounds() { + let order = EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }; + + let len = 10; + let v = (0..len) + .map(|i| order.relative_last_activity(len, i).into_inner()) + .collect::>(); + + 
assert_eq!(v.first(), Some(&1.0)); + assert_eq!(v.last(), Some(&0.1)); + assert!(v.windows(2).all(|slice| slice[0] > slice[1])); + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 1fbca1086f..71b486a4d3 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -58,24 +58,6 @@ paths: responses: "200": description: The reload completed successfully. - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error (also hits if no keys were found) - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}: parameters: @@ -93,62 +75,16 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: | Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved. 
404 means that deletion successfully finished" responses: - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" + "200": + description: Tenant was successfully deleted, or was already not found. "404": - description: Tenant not found + description: Tenant not found. This is a success result, equivalent to 200. content: application/json: schema: @@ -165,19 +101,35 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_id}/time_travel_remote_storage: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: travel_to + in: query + required: true + schema: + type: string + format: date-time + - name: done_if_after + in: query + required: true + schema: + type: string + format: date-time + put: + description: Time travel the tenant's remote storage + responses: + "200": + description: OK + content: + application/json: + schema: + type: string /v1/tenant/{tenant_id}/timeline: parameters: @@ -197,36 +149,6 @@ paths: type: array items: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - 
description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}: @@ -251,60 +173,12 @@ paths: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" delete: description: "Attempts to delete specified timeline. 500 and 409 errors should be retried" responses: - "400": - description: Error when no tenant id found in path or no timeline id - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "404": - description: Timeline not found + description: Timeline not found. This is the success path. 
content: application/json: schema: @@ -321,18 +195,6 @@ paths: application/json: schema: $ref: "#/components/schemas/PreconditionFailedError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn: parameters: @@ -365,36 +227,6 @@ paths: schema: type: string format: date-time - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found, or there is no timestamp information for the given lsn - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: @@ -419,12 +251,6 @@ paths: type: string format: date-time description: A timestamp to get the LSN - - name: version - in: query - required: false - schema: - type: integer - description: The version of the endpoint to use responses: "200": description: OK @@ -432,36 +258,37 @@ paths: application/json: schema: $ref: "#/components/schemas/LsnByTimestampResponse" - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease: + parameters: + - name: tenant_shard_id + in: path + required: true 
+ schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + format: hex + post: + description: Obtain lease for the given LSN + parameters: + - name: lsn + in: query + required: true + schema: + type: string + format: hex + description: A LSN to obtain the lease for + responses: + "200": + description: OK content: application/json: schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + $ref: "#/components/schemas/LsnLease" /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc: parameters: @@ -485,147 +312,9 @@ paths: application/json: schema: type: string - "400": - description: Error when no tenant id found in path, no timeline id or invalid timestamp - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/attach: + /v1/tenant/{tenant_shard_id}/location_config: parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - post: - description: | - Schedules attach operation to happen in the background for the given tenant. - As soon as the caller sends this request, it must assume the pageserver - starts writing to the tenant's S3 state unless it receives one of the - distinguished errors below that state otherwise. - - If a client receives a not-distinguished response, e.g., a network timeout, - it MUST retry the /attach request and poll again for the tenant's - attachment status. - - After the client has received a 202, it MUST poll the tenant's - attachment status (field `attachment_status`) to reach state `attached`. - If the `attachment_status` is missing, the client MUST retry the `/attach` - request (goto previous paragraph). This is a robustness measure in case the tenant - status endpoint is buggy, but the attach operation is ongoing. - - There is no way to cancel an in-flight request. - - In any case, the client - * MUST NOT ASSUME that the /attach request has been lost in the network, - * MUST NOT ASSUME that the request has been lost, based on the observation - that a subsequent tenant status request returns 404. The request may - still be in flight. It must be retried. - - The client SHOULD supply a `TenantConfig` for the tenant in the request body. - Settings specified in the config override the pageserver's defaults. - It is guaranteed that the config settings are applied before the pageserver - starts operating on the tenant. E.g., if the config specifies a specific - PITR interval for a tenant, then that setting will be in effect before the - pageserver starts the garbage collection loop. This enables a client to - guarantee a specific PITR setting across detach/attach cycles. 
- The pageserver will reject the request if it cannot parse the config, or - if there are any unknown fields in it. - - If the client does not supply a config, the pageserver will use its defaults. - This behavior is deprecated: https://github.com/neondatabase/neon/issues/4282 - requestBody: - required: false - content: - application/json: - schema: - $ref: "#/components/schemas/TenantAttachRequest" - responses: - "202": - description: Tenant attaching scheduled - "400": - description: Bad Request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Timeline not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "409": - description: | - The tenant is already known to Pageserver in some way, - and hence this `/attach` call has been rejected. - - Some examples of how this can happen: - - tenant was created on this pageserver - - tenant attachment was started by an earlier call to `/attach`. - - Callers should poll the tenant status's `attachment_status` field, - like for status 202. See the longer description for `POST /attach` - for details. - content: - application/json: - schema: - $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - - /v1/tenant/{tenant_id}/location_config: - parameters: - - name: tenant_id + - name: tenant_shard_id in: path required: true schema: @@ -635,6 +324,12 @@ paths: required: false schema: type: integer + - name: lazy + in: query + required: false + schema: + type: boolean + description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default. put: description: | Configures a _tenant location_, that is how a particular pageserver handles @@ -674,24 +369,10 @@ paths: responses: "200": description: Tenant is now in requested state - "503": - description: Tenant's state cannot be changed right now. Wait a few seconds and retry. content: application/json: schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" + $ref: "#/components/schemas/TenantLocationConfigResponse" "409": description: | The tenant is already known to Pageserver in some way, @@ -708,72 +389,6 @@ paths: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - - /v1/tenant/{tenant_id}/detach: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - - name: detach_ignored - in: query - required: false - schema: - type: boolean - description: | - When true, allow to detach a tenant which state is ignored. - post: - description: | - Remove tenant data (including all corresponding timelines) from pageserver's memory and file system. - Files on the remote storage are not affected. 
- responses: - "200": - description: Tenant detached - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenant not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/ignore: parameters: - name: tenant_id @@ -790,36 +405,6 @@ paths: responses: "200": description: Tenant ignored - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/load: @@ -846,81 +431,28 @@ paths: responses: "202": description: Tenant scheduled to load successfully - "400": - description: Error when no tenant id found in path parameters - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - - /v1/tenant/{tenant_id}/synthetic_size: + /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive: parameters: - name: tenant_id in: path required: true schema: type: string - get: + - name: timeline_id + in: path + required: true + schema: + type: string + post: description: | - Calculate tenant's synthetic size + Marks the initdb archive for preservation upon deletion of the timeline or tenant. + This is meant to be part of the disaster recovery process. responses: - "200": - description: Tenant's synthetic size - content: - application/json: - schema: - $ref: "#/components/schemas/SyntheticSizeResponse" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + "202": + description: Tenant scheduled to load successfully - /v1/tenant/{tenant_id}/size: + /v1/tenant/{tenant_id}/synthetic_size: parameters: - name: tenant_id in: path @@ -950,19 +482,9 @@ paths: content: application/json: schema: - type: object - required: - - id - - size - properties: - id: - type: string - format: hex - size: - type: integer - nullable: true - description: | - Size metric in bytes or null if inputs_only=true was given. + $ref: "#/components/schemas/SyntheticSizeResponse" + text/html: + description: SVG representation of the tenant and it's timelines. "401": description: Unauthorized Error content: @@ -988,6 +510,49 @@ paths: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/{tenant_shard_id}/heatmap_upload: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + post: + description: | + If the location is in an attached mode, upload the current state to the remote heatmap + responses: + "200": + description: Success + + /v1/tenant/{tenant_shard_id}/secondary/download: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: wait_ms + description: If set, we will wait this long for download to complete, and if it isn't complete then return 202 + in: query + required: false + schema: + type: integer + post: + description: | + If the location is in secondary mode, download latest heatmap and layers + responses: + "200": + description: Success + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" + "202": + description: Download has started but not yet finished + content: + application/json: + schema: + $ref: "#/components/schemas/SecondaryProgress" /v1/tenant/{tenant_id}/timeline/: parameters: @@ -1025,29 +590,11 @@ paths: format: hex responses: "201": - description: TimelineInfo + description: Timeline was 
created, or already existed with matching parameters content: application/json: schema: $ref: "#/components/schemas/TimelineInfo" - "400": - description: Malformed timeline create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "406": description: Permanently unsatisfiable request, don't retry. content: @@ -1055,24 +602,92 @@ paths: schema: $ref: "#/components/schemas/Error" "409": - description: Timeline already exists, creation skipped + description: Timeline already exists, with different parameters. Creation cannot proceed. content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error + "429": + description: A creation request was sent for the same Timeline Id while a creation was already in progress. Back off and retry. content: application/json: schema: $ref: "#/components/schemas/Error" + + /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor: + parameters: + - name: tenant_shard_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + + put: + description: | + Detach a timeline from its ancestor and reparent all ancestors timelines with lower `ancestor_lsn`. + Current implementation might not be retryable across failure cases, but will be enhanced in future. + Detaching should be expected to be expensive operation. Timeouts should be retried. + responses: + "200": + description: | + The timeline has been detached from its ancestor (now or earlier), and at least the returned timelines have been reparented. + If any timelines were deleted after reparenting, they might not be on this list. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/AncestorDetached" + + "400": + description: | + Number of early checks meaning the timeline cannot be detached now: + - the ancestor of timeline has an ancestor: not supported, see RFC + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + + "404": + description: Tenant or timeline not found. + content: + application/json: + schema: + $ref: "#/components/schemas/NotFoundError" + + "409": + description: | + The timeline can never be detached: + - timeline has no ancestor, implying that the timeline has never had an ancestor + content: + application/json: + schema: + $ref: "#/components/schemas/ConflictError" + + "500": + description: | + Transient error, for example, pageserver shutdown happened while + processing the request but we were unable to distinguish that. Must + be retried. + content: + application/json: + schema: + $ref: "#/components/schemas/Error" + "503": - description: Temporarily unavailable, please retry. + description: | + Temporarily unavailable, please retry. Possible reasons: + - another timeline detach for the same tenant is underway, please retry later + - detected shutdown error content: application/json: schema: $ref: "#/components/schemas/ServiceUnavailableError" + /v1/tenant/: get: description: Get tenants list @@ -1085,30 +700,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" post: description: | @@ -1129,43 +720,12 @@ paths: application/json: schema: type: string - "400": - description: Malformed tenant create request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" "409": description: Tenant already exists, creation skipped content: application/json: schema: $ref: "#/components/schemas/ConflictError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/config: put: @@ -1187,36 +747,6 @@ paths: type: array items: $ref: "#/components/schemas/TenantInfo" - "400": - description: Malformed tenant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. 
- content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" /v1/tenant/{tenant_id}/config/: parameters: @@ -1236,42 +766,19 @@ paths: application/json: schema: $ref: "#/components/schemas/TenantConfigResponse" - "400": - description: Malformed get tenanant config request - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "404": - description: Tenand or timeline were not found - content: - application/json: - schema: - $ref: "#/components/schemas/NotFoundError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "503": - description: Temporarily unavailable, please retry. - content: - application/json: - schema: - $ref: "#/components/schemas/ServiceUnavailableError" + + /v1/utilization: + get: + description: | + Returns the pageservers current utilization and fitness score for new tenants. + + responses: + "200": + description: Pageserver utilization and fitness score + content: + application/json: + schema: + $ref: "#/components/schemas/PageserverUtilization" components: securitySchemes: @@ -1339,16 +846,6 @@ components: generation: type: integer description: Attachment generation number. - TenantAttachRequest: - type: object - required: - - config - properties: - config: - $ref: '#/components/schemas/TenantConfig' - generation: - type: integer - description: Attachment generation number. 
TenantConfigRequest: allOf: - $ref: '#/components/schemas/TenantConfig' @@ -1361,10 +858,8 @@ components: TenantLocationConfigRequest: type: object required: - - tenant_id + - mode properties: - tenant_id: - type: string mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] @@ -1376,6 +871,32 @@ components: $ref: '#/components/schemas/SecondaryConfig' tenant_conf: $ref: '#/components/schemas/TenantConfig' + TenantLocationConfigResponse: + type: object + required: + - shards + properties: + shards: + description: Pageservers where this tenant's shards are attached. Not populated for secondary locations. + type: array + items: + $ref: "#/components/schemas/TenantShardLocation" + stripe_size: + description: If multiple shards are present, this field contains the sharding stripe size, else it is null. + type: integer + nullable: true + TenantShardLocation: + type: object + required: + - node_id + - shard_id + properties: + node_id: + description: Pageserver node ID where this shard is attached + type: integer + shard_id: + description: Tenant shard ID of the shard + type: string SecondaryConfig: type: object properties: @@ -1412,7 +933,7 @@ components: trace_read_requests: type: boolean heatmap_period: - type: integer + type: string TenantConfigResponse: type: object properties: @@ -1486,6 +1007,9 @@ components: format: hex size: type: integer + nullable: true + description: | + Size metric in bytes or null if inputs_only=true was given. segment_sizes: type: array items: @@ -1563,6 +1087,86 @@ components: type: string enum: [past, present, future, nodata] + LsnLease: + type: object + required: + - valid_until + properties: + valid_until: + type: string + format: date-time + + PageserverUtilization: + type: object + required: + - disk_usage_bytes + - free_space_bytes + - utilization_score + properties: + disk_usage_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of disk space currently used. 
+ free_space_bytes: + type: integer + format: int64 + minimum: 0 + description: The amount of usable disk space left. + utilization_score: + type: integer + format: int64 + minimum: 0 + maximum: 9223372036854775807 + default: 9223372036854775807 + description: | + Lower is better score for how good this pageserver would be for the next tenant. + The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated. + + SecondaryProgress: + type: object + required: + - heatmap_mtime + - layers_downloaded + - layers_total + - bytes_downloaded + - bytes_total + properties: + heatmap_mtime: + type: string + format: date-time + description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format) + layers_downloaded: + type: integer + format: int64 + description: How many layers from the latest layer heatmap are present on disk + bytes_downloaded: + type: integer + format: int64 + description: How many bytes of layer content from the latest layer heatmap are present on disk + layers_total: + type: integer + format: int64 + description: How many layers were in the latest layer heatmap + bytes_total: + type: integer + format: int64 + description: How many bytes of layer content were in the latest layer heatmap + + AncestorDetached: + type: object + required: + - reparented_timelines + properties: + reparented_timelines: + type: array + description: Set of reparented timeline ids + items: + type: string + format: hex + description: TimelineId + + Error: type: object required: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5c7747d353..482879630a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,6 +1,8 @@ //! //! Management HTTP API //! 
+use std::cmp::Reverse; +use std::collections::BinaryHeap; use std::collections::HashMap; use std::str::FromStr; use std::sync::Arc; @@ -14,18 +16,39 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::IngestAuxFilesRequest; +use pageserver_api::models::ListAuxFilesRequest; +use pageserver_api::models::LocationConfig; +use pageserver_api::models::LocationConfigListResponse; +use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; +use pageserver_api::models::TenantLocationConfigResponse; +use pageserver_api::models::TenantScanRemoteStorageResponse; +use pageserver_api::models::TenantScanRemoteStorageShard; +use pageserver_api::models::TenantShardLocation; +use pageserver_api::models::TenantShardSplitRequest; +use pageserver_api::models::TenantShardSplitResponse; +use pageserver_api::models::TenantSorting; +use pageserver_api::models::TenantState; +use pageserver_api::models::TopTenantShardItem; +use pageserver_api::models::TopTenantShardsRequest; +use pageserver_api::models::TopTenantShardsResponse; use pageserver_api::models::{ DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest, TenantLoadRequest, TenantLocationConfigRequest, }; +use pageserver_api::shard::ShardCount; use pageserver_api::shard::TenantShardId; +use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; +use remote_storage::TimeTravelError; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -38,15 +61,24 
@@ use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::{ - GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError, - TenantSlotError, TenantSlotUpsertError, TenantStateError, + GetTenantError, TenantManager, TenantMapError, TenantMapInsertError, TenantSlotError, + TenantSlotUpsertError, TenantStateError, }; +use crate::tenant::mgr::{TenantSlot, UpsertLocationError}; +use crate::tenant::remote_timeline_client; +use crate::tenant::remote_timeline_client::download_index_part; +use crate::tenant::remote_timeline_client::list_remote_tenant_shards; +use crate::tenant::remote_timeline_client::list_remote_timelines; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; +use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::CompactFlags; +use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; -use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources}; +use crate::tenant::GetTimelineError; +use crate::tenant::SpawnMode; +use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ @@ -70,18 +102,25 @@ use utils::{ // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. 
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); +#[cfg(not(feature = "testing"))] +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); + +// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to +// finish attaching, if calls to remote storage are slow. +#[cfg(feature = "testing")] +pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); pub struct State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, allowlist_routes: Vec, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, + latest_utilization: tokio::sync::Mutex>, } impl State { @@ -90,7 +129,7 @@ impl State { conf: &'static PageServerConf, tenant_manager: Arc, auth: Option>, - remote_storage: Option, + remote_storage: GenericRemoteStorage, broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, @@ -110,16 +149,9 @@ impl State { disk_usage_eviction_state, deletion_queue_client, secondary_controller, + latest_utilization: Default::default(), }) } - - fn tenant_resources(&self) -> TenantSharedResources { - TenantSharedResources { - broker_client: self.broker_client.clone(), - remote_storage: self.remote_storage.clone(), - deletion_queue_client: self.deletion_queue_client.clone(), - } - } } #[inline(always)] @@ -146,12 +178,10 @@ impl From for ApiError { fn from(pre: PageReconstructError) -> ApiError { match pre { PageReconstructError::Other(pre) => ApiError::InternalServerError(pre), - PageReconstructError::Cancelled => { - ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) - } - PageReconstructError::AncestorStopping(_) => { - ApiError::ResourceUnavailable(format!("{pre}").into()) + 
PageReconstructError::MissingKey(e) => { + ApiError::InternalServerError(anyhow::anyhow!("{e}")) } + PageReconstructError::Cancelled => ApiError::Cancelled, PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } @@ -175,7 +205,7 @@ impl From for ApiError { NotFound(tenant_id) => { ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into()) } - e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")), + e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")), InProgress => { ApiError::ResourceUnavailable("Tenant is being modified concurrently".into()) } @@ -190,6 +220,19 @@ impl From for ApiError { match e { InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")), MapState(e) => e.into(), + ShuttingDown(_) => ApiError::ShuttingDown, + } + } +} + +impl From for ApiError { + fn from(e: UpsertLocationError) -> ApiError { + use UpsertLocationError::*; + match e { + BadRequest(e) => ApiError::BadRequest(e), + Unavailable(_) => ApiError::ShuttingDown, + e @ InProgress => ApiError::Conflict(format!("{e}")), + Flush(e) | Other(e) => ApiError::InternalServerError(e), } } } @@ -222,16 +265,11 @@ impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), - GetTenantError::Broken(reason) => { - ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) - } GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does // in fact exist locally. If we did, the caller could draw the conclusion // that it can attach the tenant to another PS and we'd be in split-brain. - // - // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). 
ApiError::ResourceUnavailable("Tenant not yet active".into()) } GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()), @@ -239,9 +277,19 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(gte: GetTimelineError) -> Self { + // Rationale: tenant is activated only after eligible timelines activate + ApiError::NotFound(gte.into()) + } +} + impl From for ApiError { fn from(e: GetActiveTenantError) -> ApiError { match e { + GetActiveTenantError::Broken(reason) => { + ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) + } GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)), GetActiveTenantError::Cancelled => ApiError::ShuttingDown, GetActiveTenantError::NotFound(gte) => gte.into(), @@ -252,19 +300,6 @@ impl From for ApiError { } } -impl From for ApiError { - fn from(e: SetNewTenantConfigError) -> ApiError { - match e { - SetNewTenantConfigError::GetTenant(tid) => { - ApiError::NotFound(anyhow!("tenant {}", tid).into()) - } - e @ SetNewTenantConfigError::Persist(_) => { - ApiError::InternalServerError(anyhow::Error::new(e)) - } - } - } -} - impl From for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; @@ -316,11 +351,21 @@ impl From for ApiError { async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, + force_await_initial_logical_size: bool, ctx: &RequestContext, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); - let mut info = build_timeline_info_common(timeline, ctx).await?; + if force_await_initial_logical_size { + timeline.clone().await_initial_logical_size().await + } + + let mut info = build_timeline_info_common( + timeline, + ctx, + tenant::timeline::GetLogicalSizePriority::Background, + ) + .await?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. 
// Otherwise, if someone deletes the timeline / detaches the tenant while @@ -337,6 +382,7 @@ async fn build_timeline_info( async fn build_timeline_info_common( timeline: &Arc, ctx: &RequestContext, + logical_size_task_priority: tenant::timeline::GetLogicalSizePriority, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); let initdb_lsn = timeline.initdb_lsn; @@ -345,7 +391,7 @@ async fn build_timeline_info_common( let guard = timeline.last_received_wal.lock().unwrap(); if let Some(info) = guard.as_ref() { ( - Some(format!("{:?}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only. + Some(format!("{}", info.wal_source_connconf)), // Password is hidden, but it's for statistics only. Some(info.last_received_msg_lsn), Some(info.last_received_msg_ts), ) @@ -359,8 +405,7 @@ async fn build_timeline_info_common( Lsn(0) => None, lsn @ Lsn(_) => Some(lsn), }; - let current_logical_size = - timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx); + let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx); let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); let remote_consistent_lsn_projected = timeline @@ -389,6 +434,7 @@ async fn build_timeline_info_common( tenant::timeline::logical_size::Accuracy::Approximate => false, tenant::timeline::logical_size::Accuracy::Exact => true, }, + directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, timeline_dir_layer_file_size_sum: None, @@ -400,6 +446,8 @@ async fn build_timeline_info_common( state, walreceiver_status, + + last_aux_file_policy: timeline.last_aux_file_policy.load(), }; Ok(info) } @@ -434,8 +482,12 @@ async fn reload_auth_validation_keys_handler( json_response(StatusCode::OK, ()) } Err(e) => { + let err_msg = "Error reloading public keys"; warn!("Error 
reloading public keys from {key_path:?}: {e:}"); - json_response(StatusCode::INTERNAL_SERVER_ERROR, ()) + json_response( + StatusCode::INTERNAL_SERVER_ERROR, + HttpErrorBody::from_msg(err_msg.to_string()), + ) } } } @@ -455,48 +507,77 @@ async fn timeline_create_handler( let state = get_state(&request); async { - let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - match tenant.create_timeline( - new_timeline_id, - request_data.ancestor_timeline_id.map(TimelineId::from), - request_data.ancestor_start_lsn, - request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), - request_data.existing_initdb_timeline_id, - state.broker_client.clone(), - &ctx, - ) - .await { + if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() { + tracing::info!(%ancestor_id, "starting to branch"); + } else { + tracing::info!("bootstrapping"); + } + + match tenant + .create_timeline( + new_timeline_id, + request_data.ancestor_timeline_id, + request_data.ancestor_start_lsn, + request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), + request_data.existing_initdb_timeline_id, + state.broker_client.clone(), + &ctx, + ) + .await + { Ok(new_timeline) => { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(&new_timeline, &ctx) - .await - .map_err(ApiError::InternalServerError)?; + let timeline_info = build_timeline_info_common( + &new_timeline, + &ctx, + tenant::timeline::GetLogicalSizePriority::User, + ) + .await + .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } - Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => { - json_response(StatusCode::CONFLICT, ()) + Err(_) if tenant.cancel.is_cancelled() => { + // In case we get some ugly error type during shutdown, cast it into a clean 503. + json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg("Tenant shutting down".to_string()), + ) } - Err(tenant::CreateTimelineError::AncestorLsn(err)) => { - json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg( - format!("{err:#}") - )) - } - Err(e @ tenant::CreateTimelineError::AncestorNotActive) => { - json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string())) - } - Err(tenant::CreateTimelineError::ShuttingDown) => { - json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string())) + Err(e @ tenant::CreateTimelineError::Conflict) => { + json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())) } + Err(e @ tenant::CreateTimelineError::AlreadyCreating) => json_response( + StatusCode::TOO_MANY_REQUESTS, + HttpErrorBody::from_msg(e.to_string()), + ), + Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response( + StatusCode::NOT_ACCEPTABLE, + HttpErrorBody::from_msg(format!("{err:#}")), + ), + Err(e @ tenant::CreateTimelineError::AncestorNotActive) => json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg(e.to_string()), + ), + Err(tenant::CreateTimelineError::ShuttingDown) => json_response( + StatusCode::SERVICE_UNAVAILABLE, + HttpErrorBody::from_msg("tenant shutting down".to_string()), + ), 
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)), } } .instrument(info_span!("timeline_create", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug(), - timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + shard_id = %tenant_shard_id.shard_slug(), + timeline_id = %new_timeline_id, + lsn=?request_data.ancestor_start_lsn, + pg_version=?request_data.pg_version + )) .await } @@ -507,12 +588,20 @@ async fn timeline_list_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; + let force_await_initial_logical_size: Option = + parse_query_param(&request, "force-await-initial-logical-size")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); @@ -520,6 +609,7 @@ async fn timeline_list_handler( let timeline_info = build_timeline_info( &timeline, include_non_incremental_logical_size.unwrap_or(false), + force_await_initial_logical_size.unwrap_or(false), &ctx, ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) @@ -539,6 +629,44 @@ async fn timeline_list_handler( json_response(StatusCode::OK, response_data) } +async fn timeline_preserve_initdb_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request,
"tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + // Part of the process for disaster recovery from safekeeper-stored WAL: + // If we don't recover into a new timeline but want to keep the timeline ID, + // then the initdb archive is deleted. This endpoint copies it to a different + // location where timeline recreation can find it. + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + let timeline = tenant.get_timeline(timeline_id, false)?; + + timeline + .preserve_initdb_archive() + .await + .context("preserving initdb archive") + .map_err(ApiError::InternalServerError)?; + + Ok::<_, ApiError>(()) + } + .instrument(info_span!("timeline_preserve_initdb_archive", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + %timeline_id)) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, @@ -547,21 +675,27 @@ async fn timeline_detail_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; + let force_await_initial_logical_size: Option = + parse_query_param(&request, "force-await-initial-logical-size")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; // Logical size calculation needs downloading.
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let state = get_state(&request); let timeline_info = async { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; - let timeline = tenant - .get_timeline(timeline_id, false) - .map_err(|e| ApiError::NotFound(e.into()))?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant.get_timeline(timeline_id, false)?; let timeline_info = build_timeline_info( &timeline, include_non_incremental_logical_size.unwrap_or(false), + force_await_initial_logical_size.unwrap_or(false), &ctx, ) .await @@ -585,8 +719,9 @@ async fn get_lsn_by_timestamp_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -601,11 +736,14 @@ async fn get_lsn_by_timestamp_handler( let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; - #[derive(serde::Serialize)] + #[derive(serde::Serialize, Debug)] struct Result { lsn: Lsn, kind: &'static str, @@ -616,7 +754,14 @@ async fn get_lsn_by_timestamp_handler( LsnForTimestamp::Past(lsn) => (lsn, "past"), LsnForTimestamp::NoData(lsn) => (lsn, "nodata"), }; - json_response(StatusCode::OK, Result { 
lsn, kind }) + let result = Result { lsn, kind }; + tracing::info!( + lsn=?result.lsn, + kind=%result.kind, + timestamp=%timestamp_raw, + "lsn_by_timestamp finished" + ); + json_response(StatusCode::OK, result) } async fn get_timestamp_of_lsn_handler( @@ -625,8 +770,9 @@ async fn get_timestamp_of_lsn_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" @@ -641,7 +787,9 @@ async fn get_timestamp_of_lsn_handler( .map_err(ApiError::BadRequest)?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { @@ -649,7 +797,9 @@ async fn get_timestamp_of_lsn_handler( let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); json_response(StatusCode::OK, time) } - None => json_response(StatusCode::NOT_FOUND, ()), + None => Err(ApiError::NotFound( + anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + )), } } @@ -674,23 +824,34 @@ async fn tenant_attach_handler( let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?; - if state.remote_storage.is_none() { - return Err(ApiError::BadRequest(anyhow!( - "attach_tenant is not possible because pageserver was configured without remote storage" + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let shard_params = ShardParameters::default(); + let location_conf = 
LocationConf::attached_single(tenant_conf, generation, &shard_params); + + let tenant = state + .tenant_manager + .upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx) + .await?; + + let Some(tenant) = tenant else { + // This should never happen: indicates a bug in upsert_location + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Upsert succeeded but didn't return tenant!" + ))); + }; + + // We might have successfully constructed a Tenant, but it could still + // end up in a broken state: + if let TenantState::Broken { + reason, + backtrace: _, + } = tenant.current_state() + { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Tenant state is Broken: {reason}" ))); } - mgr::attach_tenant( - state.conf, - tenant_id, - generation, - tenant_conf, - state.tenant_resources(), - &ctx, - ) - .instrument(info_span!("tenant_attach", %tenant_id)) - .await?; - json_response(StatusCode::ACCEPTED, ()) } @@ -706,7 +867,7 @@ async fn timeline_delete_handler( let tenant = state .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, false) + .get_attached_tenant_shard(tenant_shard_id) .map_err(|e| { match e { // GetTenantError has a built-in conversion to ApiError, but in this context we don't @@ -718,7 +879,7 @@ async fn timeline_delete_handler( } })?; tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id)) + tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) .await?; json_response(StatusCode::ACCEPTED, ()) @@ -737,14 +898,16 @@ async fn tenant_detach_handler( let state = get_state(&request); let conf = state.conf; - mgr::detach_tenant( - conf, - tenant_shard_id, - detach_ignored.unwrap_or(false), - &state.deletion_queue_client, - ) - 
.instrument(info_span!("tenant_detach", %tenant_id)) - .await?; + state + .tenant_manager + .detach_tenant( + conf, + tenant_shard_id, + detach_ignored.unwrap_or(false), + &state.deletion_queue_client, + ) + .instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug())) + .await?; json_response(StatusCode::OK, ()) } @@ -762,7 +925,7 @@ async fn tenant_reset_handler( let state = get_state(&request); state .tenant_manager - .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx) + .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx) .await .map_err(ApiError::InternalServerError)?; @@ -822,19 +985,21 @@ async fn tenant_list_handler( _cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&request, None)?; + let state = get_state(&request); - let response_data = mgr::list_tenants() - .instrument(info_span!("tenant_list")) - .await + let response_data = state + .tenant_manager + .list_tenants() .map_err(|_| { ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into()) })? .iter() - .map(|(id, state)| TenantInfo { + .map(|(id, state, gen)| TenantInfo { id: *id, state: state.clone(), current_physical_size: None, attachment_status: state.attachment_status(), + generation: (*gen).into(), }) .collect::>(); @@ -847,9 +1012,27 @@ async fn tenant_status( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + + // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting. 
+ let activate = true; + #[cfg(feature = "testing")] + let activate = parse_query_param(&request, "activate")?.unwrap_or(activate); let tenant_info = async { - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + if activate { + // This is advisory: we prefer to let the tenant activate on-demand when this function is + // called, but it is still valid to return 200 and describe the current state of the tenant + // if it doesn't make it into an active state. + tenant + .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) + .await + .ok(); + } // Calculate total physical size of all timelines let mut current_physical_size = 0; @@ -864,7 +1047,9 @@ async fn tenant_status( state: state.clone(), current_physical_size: Some(current_physical_size), attachment_status: state.attachment_status(), + generation: tenant.generation().into(), }, + walredo: tenant.wal_redo_manager_status(), timelines: tenant.list_timeline_ids(), }) } @@ -886,16 +1071,23 @@ async fn tenant_delete_handler( let state = get_state(&request); - state + let status = state .tenant_manager .delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT) .instrument(info_span!("tenant_delete_handler", tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug() + shard_id = %tenant_shard_id.shard_slug() )) .await?; - json_response(StatusCode::ACCEPTED, ()) + // Callers use 404 as success for deletions, for historical reasons. + if status == StatusCode::NOT_FOUND { + return Err(ApiError::NotFound( + anyhow::anyhow!("Deletion complete").into(), + )); + } + + json_response(status, ()) } /// HTTP endpoint to query the current tenant_size of a tenant. 
@@ -920,16 +1112,20 @@ async fn tenant_size_handler( let inputs_only: Option = parse_query_param(&request, "inputs_only")?; let retention_period: Option = parse_query_param(&request, "retention_period")?; let headers = request.headers(); + let state = get_state(&request); - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_shard_id, true)?; - - if !tenant_shard_id.is_zero() { + if !tenant_shard_id.is_shard_zero() { return Err(ApiError::BadRequest(anyhow!( "Size calculations are only available on shard zero" ))); } + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + // this can be long operation let inputs = tenant .gather_size_inputs( @@ -939,7 +1135,10 @@ async fn tenant_size_handler( &ctx, ) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| match e { + crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + })?; let mut sizes = None; let accepts_html = headers @@ -947,9 +1146,7 @@ async fn tenant_size_handler( .map(|v| v == "text/html") .unwrap_or_default(); if !inputs_only.unwrap_or(false) { - let storage_model = inputs - .calculate_model() - .map_err(ApiError::InternalServerError)?; + let storage_model = inputs.calculate_model(); let size = storage_model.calculate(); // If request header expects html, return html @@ -988,6 +1185,35 @@ async fn tenant_size_handler( ) } +async fn tenant_shard_split_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let req: TenantShardSplitRequest = json_request(&mut request).await?; + + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let state = get_state(&request); + let ctx = 
RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let new_shards = state + .tenant_manager + .shard_split( + tenant, + ShardCount::new(req.new_shard_count), + req.new_stripe_size, + &ctx, + ) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, TenantShardSplitResponse { new_shards }) +} + async fn layer_map_info_handler( request: Request, _cancel: CancellationToken, @@ -996,10 +1222,13 @@ async fn layer_map_info_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset); + let state = get_state(&request); check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let layer_map_info = timeline.layer_map_info(reset).await; json_response(StatusCode::OK, layer_map_info) @@ -1013,10 +1242,15 @@ async fn layer_download_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let layer_name = LayerName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let downloaded = timeline - .download_layer(layer_file_name) + .download_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ 
-1038,10 +1272,16 @@ async fn evict_timeline_layer_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let layer_name = LayerName::from_str(layer_file_name) + .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let evicted = timeline - .evict_layer(layer_file_name) + .evict_layer(&layer_name) .await .map_err(ApiError::InternalServerError)?; @@ -1148,36 +1388,35 @@ async fn tenant_create_handler( let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - let new_tenant = mgr::create_tenant( - state.conf, - tenant_conf, - target_tenant_id, - generation, - state.tenant_resources(), - &ctx, - ) - .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id)) - .await?; + let location_conf = + LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters); + let new_tenant = state + .tenant_manager + .upsert_location( + target_tenant_id, + location_conf, + None, + SpawnMode::Create, + &ctx, + ) + .await?; + + let Some(new_tenant) = new_tenant else { + // This should never happen: indicates a bug in upsert_location + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Upsert succeeded but didn't return tenant!" + ))); + }; // We created the tenant. Existing API semantics are that the tenant // is Active when this function returns. 
- if let res @ Err(_) = new_tenant + new_tenant .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) - .await - { - // This shouldn't happen because we just created the tenant directory - // in tenant::mgr::create_tenant, and there aren't any remote timelines - // to load, so, nothing can really fail during load. - // Don't do cleanup because we don't know how we got here. - // The tenant will likely be in `Broken` state and subsequent - // calls will fail. - res.context("created tenant failed to become active") - .map_err(ApiError::InternalServerError)?; - } + .await?; json_response( StatusCode::CREATED, - TenantCreateResponse(new_tenant.tenant_id()), + TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id), ) } @@ -1187,8 +1426,11 @@ async fn get_tenant_config_handler( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); - let tenant = mgr::get_tenant(tenant_shard_id, false)?; + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; let response = HashMap::from([ ( @@ -1216,13 +1458,31 @@ async fn update_tenant_config_handler( let tenant_id = request_data.tenant_id; check_permission(&request, Some(tenant_id))?; - let tenant_conf = + let new_tenant_conf = TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; let state = get_state(&request); - mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id) - .instrument(info_span!("tenant_config", %tenant_id)) - .await?; + + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + // This is a legacy API that only operates on attached tenants: the preferred + // API to use is the location_config/ endpoint, which lets the caller provide + // the full 
LocationConf. + let location_conf = LocationConf::attached_single( + new_tenant_conf.clone(), + tenant.get_generation(), + &ShardParameters::default(), + ); + + crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) + .await + .map_err(ApiError::InternalServerError)?; + tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) } @@ -1235,6 +1495,7 @@ async fn put_tenant_location_config_handler( let request_data: TenantLocationConfigRequest = json_request(&mut request).await?; let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis); + let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false); check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); @@ -1244,13 +1505,14 @@ async fn put_tenant_location_config_handler( // The `Detached` state is special, it doesn't upsert a tenant, it removes // its local disk content and drops it from memory. if let LocationConfigMode::Detached = request_data.config.mode { - if let Err(e) = - mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) - .instrument(info_span!("tenant_detach", - tenant_id = %tenant_shard_id.tenant_id, - shard = %tenant_shard_id.shard_slug() - )) - .await + if let Err(e) = state + .tenant_manager + .detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client) + .instrument(info_span!("tenant_detach", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + )) + .await { match e { TenantStateError::SlotError(TenantSlotError::NotFound(_)) => { @@ -1265,14 +1527,20 @@ async fn put_tenant_location_config_handler( let location_conf = LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - state + // lazy==true queues up for activation or jumps the queue like normal when a compute connects, + // similar to at startup ordering. 
+ let spawn_mode = if lazy { + tenant::SpawnMode::Lazy + } else { + tenant::SpawnMode::Eager + }; + + let tenant = state .tenant_manager - .upsert_location(tenant_shard_id, location_conf, flush, &ctx) - .await - // TODO: badrequest assumes the caller was asking for something unreasonable, but in - // principle we might have hit something like concurrent API calls to the same tenant, - // which is not a 400 but a 409. - .map_err(ApiError::BadRequest)?; + .upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx) + .await?; + let stripe_size = tenant.as_ref().map(|t| t.get_shard_stripe_size()); + let attached = tenant.is_some(); if let Some(_flush_ms) = flush { match state @@ -1291,6 +1559,137 @@ async fn put_tenant_location_config_handler( tracing::info!("No flush requested when configuring"); } + // This API returns a vector of pageservers where the tenant is attached: this is + // primarily for use in the sharding service. For compatibility, we also return this + // when called directly on a pageserver, but the payload is always zero or one shards.
+ let mut response = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; + if attached { + response.shards.push(TenantShardLocation { + shard_id: tenant_shard_id, + node_id: state.conf.id, + }); + if tenant_shard_id.shard_count.count() > 1 { + // Stripe size should be set if we are attached + debug_assert!(stripe_size.is_some()); + response.stripe_size = stripe_size; + } + } + + json_response(StatusCode::OK, response) +} + +async fn list_location_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let slots = state.tenant_manager.list(); + let result = LocationConfigListResponse { + tenant_shards: slots + .into_iter() + .map(|(tenant_shard_id, slot)| { + let v = match slot { + TenantSlot::Attached(t) => Some(t.get_location_conf()), + TenantSlot::Secondary(s) => Some(s.get_location_conf()), + TenantSlot::InProgress(_) => None, + }; + (tenant_shard_id, v) + }) + .collect(), + }; + json_response(StatusCode::OK, result) +} + +async fn get_location_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let slot = state.tenant_manager.get(tenant_shard_id); + + let Some(slot) = slot else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + let result: Option = match slot { + TenantSlot::Attached(t) => Some(t.get_location_conf()), + TenantSlot::Secondary(s) => Some(s.get_location_conf()), + TenantSlot::InProgress(_) => None, + }; + + json_response(StatusCode::OK, result) +} + +// Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached +// (from all pageservers) as it invalidates consistency assumptions. 
+async fn tenant_time_travel_remote_storage_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let timestamp_raw = must_get_query_param(&request, "travel_to")?; + let timestamp = humantime::parse_rfc3339(&timestamp_raw) + .with_context(|| format!("Invalid time for travel_to: {timestamp_raw:?}")) + .map_err(ApiError::BadRequest)?; + + let done_if_after_raw = must_get_query_param(&request, "done_if_after")?; + let done_if_after = humantime::parse_rfc3339(&done_if_after_raw) + .with_context(|| format!("Invalid time for done_if_after: {done_if_after_raw:?}")) + .map_err(ApiError::BadRequest)?; + + // This is just a sanity check to fend off naive wrong usages of the API: + // the tenant needs to be detached *everywhere* + let state = get_state(&request); + let we_manage_tenant = state.tenant_manager.manages_tenant_shard(tenant_shard_id); + if we_manage_tenant { + return Err(ApiError::BadRequest(anyhow!( + "Tenant {tenant_shard_id} is already attached at this pageserver" + ))); + } + + if timestamp > done_if_after { + return Err(ApiError::BadRequest(anyhow!( + "The done_if_after timestamp comes before the timestamp to recover to" + ))); + } + + tracing::info!("Issuing time travel request internally.
timestamp={timestamp_raw}, done_if_after={done_if_after_raw}"); + + remote_timeline_client::upload::time_travel_recover_tenant( + &state.remote_storage, + &tenant_shard_id, + timestamp, + done_if_after, + &cancel, + ) + .await + .map_err(|e| match e { + TimeTravelError::BadInput(e) => { + warn!("bad input error: {e}"); + ApiError::BadRequest(anyhow!("bad input error")) + } + TimeTravelError::Unimplemented => { + ApiError::BadRequest(anyhow!("unimplemented for the configured remote storage")) + } + TimeTravelError::Cancelled => ApiError::InternalServerError(anyhow!("cancelled")), + TimeTravelError::TooManyVersions => { + ApiError::InternalServerError(anyhow!("too many versions in remote storage")) + } + TimeTravelError::Other(e) => { + warn!("internal error: {e}"); + ApiError::InternalServerError(anyhow!("internal error")) + } + })?; + json_response(StatusCode::OK, ()) } @@ -1301,14 +1700,42 @@ async fn handle_tenant_break( ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true) - .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; - - tenant.set_broken("broken from test".to_owned()).await; + let state = get_state(&r); + state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)? + .set_broken("broken from test".to_owned()) + .await; json_response(StatusCode::OK, ()) } +// Obtains an lsn lease on the given timeline. +async fn lsn_lease_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let lsn: Lsn = parse_query_param(&request, "lsn")? 
+ .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + let result = timeline + .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx) + .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?; + + json_response(StatusCode::OK, result) +} + // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, @@ -1321,13 +1748,7 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = - mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; - let gc_result = wait_task_done - .await - .context("wait for gc task") - .map_err(ApiError::InternalServerError)? - .map_err(ApiError::InternalServerError)?; + let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; json_response(StatusCode::OK, gc_result) } @@ -1341,17 +1762,28 @@ async fn timeline_compact_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? 
{ + flags |= CompactFlags::ForceImageLayerCreation; + } + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; timeline .compact(&cancel, flags, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + } json_response(StatusCode::OK, ()) } .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) @@ -1367,21 +1799,44 @@ async fn timeline_checkpoint_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? 
{ + flags |= CompactFlags::ForceImageLayerCreation; + } + let wait_until_uploaded = + parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; timeline .freeze_and_flush() .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| { + match e { + tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(other.into()), + + } + })?; timeline .compact(&cancel, flags, &ctx) .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; + .map_err(|e| + match e { + CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Other(e) => ApiError::InternalServerError(e) + } + )?; + + if wait_until_uploaded { + timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?; + } json_response(StatusCode::OK, ()) } @@ -1398,7 +1853,11 @@ async fn timeline_download_remote_layers_handler_post( let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; match timeline.spawn_download_all_remote_layers(body).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), @@ -1412,8 +1871,11 @@ async fn timeline_download_remote_layers_handler_get( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let 
timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; let info = timeline .get_download_all_remote_layers_task_info() .context("task never started since last pageserver process start") @@ -1421,17 +1883,78 @@ async fn timeline_download_remote_layers_handler_get( json_response(StatusCode::OK, info) } +async fn timeline_detach_ancestor_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + use crate::tenant::timeline::detach_ancestor::Options; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + + let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); + + async move { + let mut options = Options::default(); + + let rewrite_concurrency = + parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?; + let copy_concurrency = + parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?; + + [ + (&mut options.rewrite_concurrency, rewrite_concurrency), + (&mut options.copy_concurrency, copy_concurrency), + ] + .into_iter() + .filter_map(|(target, val)| val.map(|val| (target, val))) + .for_each(|(target, val)| *target = val); + + let state = get_state(&request); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); + let ctx = &ctx; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let 
(_guard, prepared) = timeline + .prepare_to_detach_from_ancestor(&tenant, options, ctx) + .await?; + + let res = state + .tenant_manager + .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx) + .await; + + match res { + Ok(reparented_timelines) => { + let resp = pageserver_api::models::detach_ancestor::AncestorDetached { + reparented_timelines, + }; + + json_response(StatusCode::OK, resp) + } + Err(e) => Err(ApiError::InternalServerError( + e.context("timeline detach completion"), + )), + } + } + .instrument(span) + .await +} + async fn deletion_queue_flush( r: Request, cancel: CancellationToken, ) -> Result, ApiError> { let state = get_state(&r); - if state.remote_storage.is_none() { - // Nothing to do if remote storage is disabled. - return json_response(StatusCode::OK, ()); - } - let execute = parse_query_param(&r, "execute")?.unwrap_or(false); let flush = async { @@ -1462,6 +1985,7 @@ async fn getpage_at_lsn_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); struct Key(crate::repository::Key); @@ -1480,7 +2004,7 @@ async fn getpage_at_lsn_handler( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let page = timeline.get(key.0, lsn, &ctx).await?; @@ -1503,19 +2027,22 @@ async fn timeline_collect_keyspace( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); let at_lsn: Option = 
parse_query_param(&request, "at_lsn")?; async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); - let keys = timeline + let (dense_ks, sparse_ks) = timeline .collect_keyspace(at_lsn, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; - let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn }; + // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace. + // Therefore, we split dense/sparse keys in this API. + let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn }; json_response(StatusCode::OK, res) } @@ -1524,13 +2051,15 @@ async fn timeline_collect_keyspace( } async fn active_timeline_of_active_tenant( + tenant_manager: &TenantManager, tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { - let tenant = mgr::get_tenant(tenant_shard_id, true)?; - tenant - .get_timeline(timeline_id, true) - .map_err(|e| ApiError::NotFound(e.into())) + let tenant = tenant_manager.get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + Ok(tenant.get_timeline(timeline_id, true)?) 
} async fn always_panic_handler( @@ -1588,19 +2117,13 @@ async fn disk_usage_eviction_run( }; let state = get_state(&r); - - let Some(storage) = state.remote_storage.as_ref() else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "remote storage not configured, cannot run eviction iteration" - ))); - }; - - let state = state.disk_usage_eviction_state.clone(); + let eviction_state = state.disk_usage_eviction_state.clone(); let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( - &state, - storage, + &eviction_state, + &state.remote_storage, usage, + &state.tenant_manager, config.eviction_order, &cancel, ) @@ -1628,19 +2151,136 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn tenant_scan_remote_handler( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + + let mut response = TenantScanRemoteStorageResponse::default(); + + let (shards, _other_keys) = + list_remote_tenant_shards(&state.remote_storage, tenant_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + for tenant_shard_id in shards { + let (timeline_ids, _other_keys) = + list_remote_timelines(&state.remote_storage, tenant_shard_id, cancel.clone()) + .await + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; + + let mut generation = Generation::none(); + for timeline_id in timeline_ids { + match download_index_part( + &state.remote_storage, + &tenant_shard_id, + &timeline_id, + Generation::MAX, + &cancel, + ) + .instrument(info_span!("download_index_part", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id)) + .await + { + Ok((index_part, index_generation)) => { + tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", + 
index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); + generation = std::cmp::max(generation, index_generation); + } + Err(DownloadError::NotFound) => { + // This is normal for tenants that were created with multiple shards: they have an unsharded path + // containing the timeline's initdb tarball but no index. Otherwise it is a bit strange. + tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping"); + continue; + } + Err(e) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + }; + } + + response.shards.push(TenantScanRemoteStorageShard { + tenant_shard_id, + generation: generation.into(), + }); + } + + if response.shards.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(), + )); + } + + json_response(StatusCode::OK, response) +} + async fn secondary_download_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let state = get_state(&request); let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; - state - .secondary_controller - .download_tenant(tenant_shard_id) - .await - .map_err(ApiError::InternalServerError)?; + let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis); - json_response(StatusCode::OK, ()) + // We don't need this to issue the download request, but: + // - it enables us to cleanly return 404 if we get a request for an absent shard + // - we will use this to provide status feedback in the response + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let timeout = wait.unwrap_or(Duration::MAX); + + let status = match tokio::time::timeout( + timeout, + state.secondary_controller.download_tenant(tenant_shard_id), + ) + .await + { + // 
Download job ran to completion. + Ok(Ok(())) => StatusCode::OK, + // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered + // okay. We could get an error here in the unlikely edge case that the tenant + // was detached between our check above and executing the download job. + Ok(Err(e)) => return Err(ApiError::InternalServerError(e)), + // A timeout is not an error: we have started the download, we're just not done + // yet. The caller will get a response body indicating status. + Err(_) => StatusCode::ACCEPTED, + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(status, progress) +} + +async fn secondary_status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + + let Some(secondary_tenant) = state + .tenant_manager + .get_secondary_tenant_shard(tenant_shard_id) + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(), + )); + }; + + let progress = secondary_tenant.progress.lock().unwrap().clone(); + + json_response(StatusCode::OK, progress) } async fn handler_404(_: Request) -> Result, ApiError> { @@ -1683,6 +2323,256 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } +async fn force_aux_policy_switch_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?; + let policy: AuxFilePolicy = json_request(&mut r).await?; + + let state = get_state(&r); + + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + let timeline = + 
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + timeline + .do_switch_aux_policy(policy) + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +async fn put_io_engine_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?; + crate::virtual_file::io_engine::set(kind); + json_response(StatusCode::OK, ()) +} + +/// Polled by control plane. +/// +/// See [`crate::utilization`]. +async fn get_utilization( + r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + fail::fail_point!("get-utilization-http-handler", |_| { + Err(ApiError::ResourceUnavailable("failpoint".into())) + }); + + // this probably could be completely public, but lets make that change later. + check_permission(&r, None)?; + + let state = get_state(&r); + let mut g = state.latest_utilization.lock().await; + + let regenerate_every = Duration::from_secs(1); + let still_valid = g + .as_ref() + .is_some_and(|(captured_at, _)| captured_at.elapsed() < regenerate_every); + + // avoid needless statvfs calls even though those should be non-blocking fast. + // regenerate at most 1Hz to allow polling at any rate. 
+ if !still_valid { + let path = state.conf.tenants_path(); + let doc = crate::utilization::regenerate(path.as_std_path()) + .map_err(ApiError::InternalServerError)?; + + let mut buf = Vec::new(); + serde_json::to_writer(&mut buf, &doc) + .context("serialize") + .map_err(ApiError::InternalServerError)?; + + let body = bytes::Bytes::from(buf); + + *g = Some((std::time::Instant::now(), body)); + } + + // hyper 0.14 doesn't yet have Response::clone so this is a bit of extra legwork + let cached = g.as_ref().expect("just set").1.clone(); + + Response::builder() + .header(hyper::http::header::CONTENT_TYPE, "application/json") + // thought of using http date header, but that is second precision which does not give any + // debugging aid + .status(StatusCode::OK) + .body(hyper::Body::from(cached)) + .context("build response") + .map_err(ApiError::InternalServerError) +} + +async fn list_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: ListAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let files = timeline.list_aux_files(body.lsn, &ctx).await?; + json_response(StatusCode::OK, files) +} + +async fn perf_info( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); 
+ + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let result = timeline.perf_info().await; + + json_response(StatusCode::OK, result) +} + +async fn ingest_aux_files( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let body: IngestAuxFilesRequest = json_request(&mut request).await?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let mut modification = timeline.begin_modification( + Lsn(timeline.get_last_record_lsn().0 + 8), /* advance LSN by 8 */ + ); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + for (fname, content) in body.aux_files { + modification + .put_file(&fname, content.as_bytes(), &ctx) + .await + .map_err(ApiError::InternalServerError)?; + } + modification + .commit(&ctx) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +/// Report on the largest tenants on this pageserver, for the storage controller to identify +/// candidates for splitting +async fn post_top_tenants( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let request: TopTenantShardsRequest = json_request(&mut r).await?; + let state = get_state(&r); + + fn get_size_metric(sizes: &TopTenantShardItem, order_by: &TenantSorting) -> u64 { + match order_by { + TenantSorting::ResidentSize => sizes.resident_size, + TenantSorting::MaxLogicalSize => sizes.max_logical_size, + } + } + + #[derive(Eq, PartialEq)] + struct HeapItem { + metric: u64, + sizes: TopTenantShardItem, + } + + impl PartialOrd for HeapItem { + fn 
partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + /// Heap items have reverse ordering on their metric: this enables using BinaryHeap, which + /// supports popping the greatest item but not the smallest. + impl Ord for HeapItem { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + Reverse(self.metric).cmp(&Reverse(other.metric)) + } + } + + let mut top_n: BinaryHeap = BinaryHeap::with_capacity(request.limit); + + // FIXME: this is a lot of clones to take this tenant list + for (tenant_shard_id, tenant_slot) in state.tenant_manager.list() { + if let Some(shards_lt) = request.where_shards_lt { + // Ignore tenants which already have >= this many shards + if tenant_shard_id.shard_count >= shards_lt { + continue; + } + } + + let sizes = match tenant_slot { + TenantSlot::Attached(tenant) => tenant.get_sizes(), + TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => { + continue; + } + }; + let metric = get_size_metric(&sizes, &request.order_by); + + if let Some(gt) = request.where_gt { + // Ignore tenants whose metric is <= the lower size threshold, to do less sorting work + if metric <= gt { + continue; + } + }; + + match top_n.peek() { + None => { + // Top N list is empty: candidate becomes first member + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric && top_n.len() < request.limit => { + // Lowest item in list is greater than our candidate, but we aren't at limit yet: push to end + top_n.push(HeapItem { metric, sizes }); + } + Some(i) if i.metric > metric => { + // List is at limit and lowest value is greater than our candidate, drop it. + } + Some(_) => top_n.push(HeapItem { metric, sizes }), + } + + while top_n.len() > request.limit { + top_n.pop(); + } + } + + json_response( + StatusCode::OK, + TopTenantShardsResponse { + shards: top_n.into_iter().map(|i| i.sizes).collect(), + }, + ) +} + /// Common functionality of all the HTTP API handlers. 
/// /// - Adds a tracing span to each request (by `request_span`) @@ -1697,6 +2587,16 @@ where R: std::future::Future, ApiError>> + Send + 'static, H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, { + if request.uri() != &"/v1/failpoints".parse::().unwrap() { + fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable( + "failpoint".into() + ))); + + fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError( + anyhow::anyhow!("failpoint") + ))); + } + // Spawn a new task to handle the request, to protect the handler from unexpected // async cancellations. Most pageserver functions are not async cancellation safe. // We arm a drop-guard, so that if Hyper drops the Future, we signal the task @@ -1808,6 +2708,7 @@ pub fn make_router( Ok(router .data(state) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) @@ -1829,12 +2730,25 @@ pub fn make_router( .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) + .put("/v1/tenant/:tenant_shard_id/shard_split", |r| { + api_handler(r, tenant_shard_split_handler) + }) .get("/v1/tenant/:tenant_shard_id/config", |r| { api_handler(r, get_tenant_config_handler) }) .put("/v1/tenant/:tenant_shard_id/location_config", |r| { api_handler(r, put_tenant_location_config_handler) }) + .get("/v1/location_config", |r| { + api_handler(r, list_location_config_handler) + }) + .get("/v1/location_config/:tenant_shard_id", |r| { + api_handler(r, get_location_config_handler) + }) + .put( + "/v1/tenant/:tenant_shard_id/time_travel_remote_storage", + |r| api_handler(r, tenant_time_travel_remote_storage_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) @@ -1856,6 +2770,10 @@ pub fn make_router( .post("/v1/tenant/:tenant_id/ignore", |r| { api_handler(r, tenant_ignore_handler) 
}) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive", + |r| api_handler(r, timeline_preserve_initdb_handler), + ) .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) @@ -1867,6 +2785,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease", + |r| api_handler(r, lsn_lease_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", |r| api_handler(r, timeline_gc_handler), @@ -1887,6 +2809,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_get), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor", + |r| api_handler(r, timeline_detach_ancestor_handler), + ) .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_delete_handler) }) @@ -1905,12 +2831,18 @@ pub fn make_router( .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) + .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| { + api_handler(r, tenant_scan_remote_handler) + }) .put("/v1/disk_usage_eviction/run", |r| { api_handler(r, disk_usage_eviction_run) }) .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .get("/v1/tenant/:tenant_shard_id/secondary/status", |r| { + api_handler(r, secondary_status_handler) + }) .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { api_handler(r, secondary_download_handler) }) @@ -1927,7 +2859,26 @@ pub fn make_router( ) .get( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", - |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace), + |r| api_handler(r, timeline_collect_keyspace), + ) + .put("/v1/io_engine", |r| 
api_handler(r, put_io_engine_handler)) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", + |r| api_handler(r, force_aux_policy_switch_handler), + ) + .get("/v1/utilization", |r| api_handler(r, get_utilization)) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", + |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files), + ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/list_aux_files", + |r| testing_api_handler("list_aux_files", r, list_aux_files), + ) + .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants)) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info", + |r| testing_api_handler("perf_info", r, perf_info), ) .any(handler_404)) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d66df36b3a..ed409d3130 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -2,28 +2,21 @@ //! Import data and WAL from a PostgreSQL data directory and WAL segments into //! a neon Timeline. //! 
-use std::io::SeekFrom; use std::path::{Path, PathBuf}; use anyhow::{bail, ensure, Context, Result}; -use async_compression::tokio::bufread::ZstdDecoder; -use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; -use nix::NixPath; -use tokio::fs::{File, OpenOptions}; -use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use pageserver_api::key::rel_block_to_key; +use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; -use tokio_tar::Builder; -use tokio_tar::HeaderMode; use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; -use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; @@ -178,7 +171,10 @@ async fn import_rel( let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { - modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + let key = rel_block_to_key(rel, blknum); + if modification.tline.get_shard_identity().is_key_local(&key) { + modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + } } // TODO: UnexpectedEof is expected @@ -633,65 +629,3 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result reader.read_to_end(&mut buf).await?; Ok(Bytes::from(buf)) } - -pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> { - let file = OpenOptions::new() - .create(true) - .truncate(true) - .read(true) - .write(true) - .open(&tmp_path) - .await - .with_context(|| format!("tempfile creation {tmp_path}"))?; - - let mut paths = Vec::new(); - for entry in WalkDir::new(pgdata_path) { - let entry = entry?; - let metadata = entry.metadata().expect("error getting dir entry metadata"); - // Also allow directories so that we also get empty directories - if 
!(metadata.is_file() || metadata.is_dir()) { - continue; - } - let path = entry.into_path(); - paths.push(path); - } - // Do a sort to get a more consistent listing - paths.sort_unstable(); - let zstd = ZstdEncoder::with_quality_and_params( - file, - Level::Default, - &[CParameter::enable_long_distance_matching(true)], - ); - let mut builder = Builder::new(zstd); - // Use reproducible header mode - builder.mode(HeaderMode::Deterministic); - for path in paths { - let rel_path = path.strip_prefix(pgdata_path)?; - if rel_path.is_empty() { - // The top directory should not be compressed, - // the tar crate doesn't like that - continue; - } - builder.append_path_with_name(&path, rel_path).await?; - } - let mut zstd = builder.into_inner().await?; - zstd.shutdown().await?; - let mut compressed = zstd.into_inner(); - let compressed_len = compressed.metadata().await?.len(); - const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; - if compressed_len > INITDB_TAR_ZST_WARN_LIMIT { - warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."); - } - compressed.seek(SeekFrom::Start(0)).await?; - Ok((compressed, compressed_len)) -} - -pub async fn extract_tar_zst( - pgdata_path: &Utf8Path, - tar_zst: impl AsyncBufRead + Unpin, -) -> Result<()> { - let tar = Box::pin(ZstdDecoder::new(tar_zst)); - let mut archive = Archive::new(tar); - archive.unpack(pgdata_path).await?; - Ok(()) -} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 26070e0cc1..c69fb8c83b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,3 +1,4 @@ +#![recursion_limit = "300"] #![deny(clippy::undocumented_unsafe_blocks)] mod auth; @@ -11,15 +12,18 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub use pageserver_api::keyspace; +pub mod aux_file; pub mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod repository; +pub mod span; pub(crate) mod statvfs; pub mod task_mgr; pub 
mod tenant; pub mod trace; +pub mod utilization; pub mod virtual_file; pub mod walingest; pub mod walrecord; @@ -28,6 +32,7 @@ pub mod walredo; use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; +use tenant::mgr::TenantManager; use tracing::info; /// Current storage format version @@ -50,7 +55,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; #[tracing::instrument(skip_all, fields(%exit_code))] -pub async fn shutdown_pageserver(deletion_queue: Option, exit_code: i32) { +pub async fn shutdown_pageserver( + tenant_manager: &TenantManager, + mut deletion_queue: DeletionQueue, + exit_code: i32, +) { use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. @@ -64,7 +73,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option, exit_cod // Shut down all the tenants. This flushes everything to disk and kills // the checkpoint and GC tasks. timed( - tenant::mgr::shutdown_all_tenants(), + tenant_manager.shutdown(), "shutdown all tenants", Duration::from_secs(5), ) @@ -80,9 +89,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option, exit_cod .await; // Best effort to persist any outstanding deletions, to avoid leaking objects - if let Some(mut deletion_queue) = deletion_queue { - deletion_queue.shutdown(Duration::from_secs(5)).await; - } + deletion_queue.shutdown(Duration::from_secs(5)).await; // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. @@ -105,33 +112,29 @@ pub async fn shutdown_pageserver(deletion_queue: Option, exit_cod std::process::exit(exit_code); } -/// The name of the metadata file pageserver creates per timeline. -/// Full path: `tenants//timelines//metadata`. -pub const METADATA_FILE_NAME: &str = "metadata"; +/// Per-tenant configuration file. +/// Full path: `tenants//config`. 
+pub(crate) const TENANT_CONFIG_NAME: &str = "config"; /// Per-tenant configuration file. /// Full path: `tenants//config`. -pub const TENANT_CONFIG_NAME: &str = "config"; - -/// Per-tenant configuration file. -/// Full path: `tenants//config`. -pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; +pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; /// Per-tenant copy of their remote heatmap, downloaded into the local /// tenant path while in secondary mode. -pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; +pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; /// A suffix used for various temporary files. Any temporary files found in the /// data directory at pageserver startup can be automatically removed. -pub const TEMP_FILE_SUFFIX: &str = "___temp"; +pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp"; /// A marker file to mark that a timeline directory was not fully initialized. /// If a timeline directory with this marker is encountered at pageserver startup, /// the timeline directory and the marker file are both removed. /// Full path: `tenants//timelines/___uninit`. -pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; -pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; +pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; /// A marker file to prevent pageserver from loading a certain tenant on restart. /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding @@ -158,23 +161,14 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool { // from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once // from the name. 
-pub fn is_uninit_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) } -pub fn is_delete_mark(path: &Utf8Path) -> bool { +pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool { ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) } -fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool { - if let Some(e) = e.io_error() { - if e.kind() == std::io::ErrorKind::NotFound { - return true; - } - } - false -} - /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. /// diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6f4431c3cf..e8a1e063c5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,17 +1,17 @@ use enum_map::EnumMap; -use metrics::metric_vec_duration::DurationResultObserver; use metrics::{ register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec, - Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec, - IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; use strum::{EnumCount, IntoEnumIterator, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; -use utils::id::{TenantId, TimelineId}; +use tracing::warn; +use utils::id::TimelineId; /// Prometheus histogram buckets (in seconds) for operations in the critical /// path. 
In other words, operations that directly affect that latency of user @@ -51,6 +51,9 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "gc")] Gc, + #[strum(serialize = "find gc cutoffs")] + FindGcCutoffs, + #[strum(serialize = "create tenant")] CreateTenant, } @@ -59,7 +62,7 @@ pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| register_counter_vec!( "pageserver_storage_operations_seconds_sum", "Total time spent on storage operations with operation, tenant and timeline dimensions", - &["operation", "tenant_id", "timeline_id"], + &["operation", "tenant_id", "shard_id", "timeline_id"], ) .expect("failed to define a metric") }); @@ -68,7 +71,7 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::n register_int_counter_vec!( "pageserver_storage_operations_seconds_count", "Count of storage operations with operation, tenant and timeline dimensions", - &["operation", "tenant_id", "timeline_id"], + &["operation", "tenant_id", "shard_id", "timeline_id"], ) .expect("failed to define a metric") }); @@ -86,41 +89,58 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_read_num_fs_layers", - "Number of persistent layers accessed for processing a read request, including those in the cache", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + "pageserver_layers_visited_per_read_global", + "Number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_visited_per_vectored_read_global", + "Average number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 
512.0, 1024.0], ) .expect("failed to define a metric") }); // Metrics collected on operations on the storage repository. +#[derive( + Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, +)] +pub(crate) enum GetKind { + Singular, + Vectored, +} pub(crate) struct ReconstructTimeMetrics { - ok: Histogram, - err: Histogram, + singular: Histogram, + vectored: Histogram, } pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["result"], + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); + ReconstructTimeMetrics { - ok: inner.get_metric_with_label_values(&["ok"]).unwrap(), - err: inner.get_metric_with_label_values(&["err"]).unwrap(), + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), } }); impl ReconstructTimeMetrics { - pub(crate) fn for_result(&self, result: &Result) -> &Histogram { - match result { - Ok(_) => &self.ok, - Err(_) => &self.err, + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, } } } @@ -133,13 +153,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::n .expect("failed to define a metric") }); -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram!( +pub(crate) struct ReconstructDataTimeMetrics { + singular: Histogram, + vectored: Histogram, +} + +impl ReconstructDataTimeMetrics { + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, + } + } +} + +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( 
"pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) - .expect("failed to define a metric") + .expect("failed to define a metric"); + + ReconstructDataTimeMetrics { + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), + } }); pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { @@ -150,6 +190,113 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) struct GetVectoredLatency { + map: EnumMap>, +} + +#[allow(dead_code)] +pub(crate) struct ScanLatency { + map: EnumMap>, +} + +impl GetVectoredLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. + const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +impl ScanLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. 
+ const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) struct ScanLatencyOngoingRecording<'a> { + parent: &'a Histogram, + start: std::time::Instant, +} + +impl<'a> ScanLatencyOngoingRecording<'a> { + pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> { + let start = Instant::now(); + ScanLatencyOngoingRecording { parent, start } + } + + pub(crate) fn observe(self, throttled: Option) { + let elapsed = self.start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + if let Some(ex_throttled) = ex_throttled { + self.parent.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } + } +} + +pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_get_vectored_seconds", + "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + GetVectoredLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + +pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_scan_seconds", + "Time spent in scan, excluding time spent in timeline_get_throttle.", 
+ &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + ScanLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = ::from_usize(task_kind_idx); + + if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, @@ -337,15 +484,6 @@ pub(crate) mod page_cache_eviction_metrics { } } -pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy = Lazy::new(|| { - register_histogram!( - "pageserver_page_cache_acquire_pinned_slot_seconds", - "Time spent acquiring a pinned slot in the page cache", - CRITICAL_OP_BUCKETS.into(), - ) - .expect("failed to define a metric") -}); - static PAGE_CACHE_ERRORS: Lazy = Lazy::new(|| { register_int_counter_vec!( "page_cache_errors_total", @@ -382,7 +520,16 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", "Last record LSN grouped by timeline", - &["tenant_id", "timeline_id"] + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static STANDBY_HORIZON: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_standby_horizon", + "Standby apply LSN for which GC is hold off, by timeline.", + &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); @@ -391,7 +538,7 @@ static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", "The size of the layer files present in the pageserver's filesystem.", - &["tenant_id", "timeline_id"] + &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); @@ -407,9 +554,9 @@ pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy = Lazy::new(|| static REMOTE_PHYSICAL_SIZE: 
Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_remote_physical_size", - "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.", + "The size of the layer files present in the remote storage that are listed in the remote index_part.json.", // Corollary: If any files are missing from the index part, they won't be included here. - &["tenant_id", "timeline_id"] + &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); @@ -442,11 +589,20 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_current_logical_size", "Current logical size grouped by timeline", - &["tenant_id", "timeline_id"] + &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define current logical size metric") }); +static AUX_FILE_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_aux_file_estimated_size", + "The size of all aux files for a timeline in aux file v2 store.", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) mod initial_logical_size { use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; @@ -574,6 +730,15 @@ pub(crate) mod initial_logical_size { }); } +static DIRECTORY_ENTRIES_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_directory_entries_count", + "Sum of the entries in pageserver-stored directory listings", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_states_count", @@ -591,7 +756,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_broken_tenants_count", "Set of broken tenants", - &["tenant_id"] + &["tenant_id", "shard_id"] ) .expect("Failed to register pageserver_tenant_states_count 
metric") }); @@ -605,26 +770,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(| .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric") }); -// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, -// or in testing they estimate how much we would upload if we did. -static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_created_persistent_files_total", - "Number of files created that are meant to be uploaded to cloud storage", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - -static PERSISTENT_BYTES_WRITTEN: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "pageserver_written_persistent_bytes_total", - "Total bytes written that are meant to be uploaded to cloud storage", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - pub(crate) static EVICTION_ITERATION_DURATION: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_eviction_iteration_duration_seconds_global", @@ -639,7 +784,7 @@ static EVICTIONS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_evictions", "Number of layers evicted from the pageserver", - &["tenant_id", "timeline_id"] + &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); @@ -682,6 +827,14 @@ pub static STARTUP_IS_LOADING: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_startup_is_loading") }); +pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_timeline_ephemeral_bytes", + "Total number of bytes in ephemeral layers, summed for all timelines. Approximate, lazily updated." + ) + .expect("Failed to register metric") +}); + /// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things /// like how long it took to load. 
/// @@ -936,11 +1089,12 @@ pub(crate) static STORAGE_IO_SIZE: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_io_operations_bytes_total", "Total amount of bytes read/written in IO operations", - &["operation", "tenant_id", "timeline_id"] + &["operation", "tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") }); +#[cfg(not(test))] pub(crate) mod virtual_file_descriptor_cache { use super::*; @@ -960,6 +1114,20 @@ pub(crate) mod virtual_file_descriptor_cache { // ``` } +#[cfg(not(test))] +pub(crate) mod virtual_file_io_engine { + use super::*; + + pub(crate) static KIND: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_virtual_file_io_engine_kind", + "The configured io engine for VirtualFile", + &["kind"], + ) + .unwrap() + }); +} + #[derive(Debug)] struct GlobalAndPerTimelineHistogram { global: Histogram, @@ -973,15 +1141,39 @@ impl GlobalAndPerTimelineHistogram { } } -struct GlobalAndPerTimelineHistogramTimer<'a> { +struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { h: &'a GlobalAndPerTimelineHistogram, + ctx: &'c RequestContext, start: std::time::Instant, + op: SmgrQueryType, } -impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { +impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { fn drop(&mut self) { let elapsed = self.start.elapsed(); - self.h.observe(elapsed.as_secs_f64()); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(res) => res, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[self.op]; + rate_limit.call(|| { + warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; 
+ self.h.observe(ex_throttled.as_secs_f64()); } } @@ -993,6 +1185,7 @@ impl<'a> Drop for GlobalAndPerTimelineHistogramTimer<'a> { strum_macros::EnumCount, strum_macros::EnumIter, strum_macros::FromRepr, + enum_map::Enum, )] #[strum(serialize_all = "snake_case")] pub enum SmgrQueryType { @@ -1000,6 +1193,7 @@ pub enum SmgrQueryType { GetRelSize, GetPageAtLsn, GetDbSize, + GetSlruSegment, } #[derive(Debug)] @@ -1011,7 +1205,7 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds", "Time spent on smgr query handling, aggegated by query type and tenant/timeline.", - &["smgr_query_type", "tenant_id", "timeline_id"], + &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") @@ -1078,8 +1272,9 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { }); impl SmgrQueryTimePerTimeline { - pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self { - let tenant_id = tenant_id.to_string(); + pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); let metrics = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); @@ -1087,7 +1282,7 @@ impl SmgrQueryTimePerTimeline { .get_metric_with_label_values(&[op.into()]) .unwrap(); let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE - .get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id]) + .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id]) .unwrap(); GlobalAndPerTimelineHistogram { global, @@ -1096,29 +1291,60 @@ impl SmgrQueryTimePerTimeline { }); Self { metrics } } - pub(crate) fn start_timer(&self, op: SmgrQueryType) -> impl Drop + '_ { + pub(crate) fn start_timer<'c: 'a, 'a>( + &'a self, + op: SmgrQueryType, + 
ctx: &'c RequestContext, + ) -> impl Drop + '_ { let metric = &self.metrics[op as usize]; + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[op]; + rate_limit.call(|| { + warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } GlobalAndPerTimelineHistogramTimer { h: metric, - start: std::time::Instant::now(), + ctx, + start, + op, } } } #[cfg(test)] mod smgr_query_time_tests { + use pageserver_api::shard::TenantShardId; use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; + use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + }; + // Regression test, we used hard-coded string constants before using an enum. 
#[test] fn op_label_name() { use super::SmgrQueryType::*; - let expect: [(super::SmgrQueryType, &'static str); 4] = [ + let expect: [(super::SmgrQueryType, &'static str); 5] = [ (GetRelExists, "get_rel_exists"), (GetRelSize, "get_rel_size"), (GetPageAtLsn, "get_page_at_lsn"), (GetDbSize, "get_db_size"), + (GetSlruSegment, "get_slru_segment"), ]; for (op, expect) in expect { let actual: &'static str = op.into(); @@ -1133,7 +1359,10 @@ mod smgr_query_time_tests { for op in &ops { let tenant_id = TenantId::generate(); let timeline_id = TimelineId::generate(); - let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id); + let metrics = super::SmgrQueryTimePerTimeline::new( + &TenantShardId::unsharded(tenant_id), + &timeline_id, + ); let get_counts = || { let global: u64 = ops @@ -1154,7 +1383,8 @@ mod smgr_query_time_tests { let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let timer = metrics.start_timer(*op); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let timer = metrics.start_timer(*op, &ctx); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); @@ -1188,11 +1418,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| }) }); -impl DurationResultObserver for BasebackupQueryTime { - fn observe_result(&self, res: &Result, duration: std::time::Duration) { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { + parent: &'a BasebackupQueryTime, + ctx: &'c RequestContext, + start: std::time::Instant, +} + +impl BasebackupQueryTime { + pub(crate) fn start_recording<'c: 'a, 'a>( + &'a self, + ctx: &'c RequestContext, + ) -> BasebackupQueryTimeOngoingRecording<'_, '_> { + let start = Instant::now(); + match ctx.micros_spent_throttled.open() { + Ok(()) => (), + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = 
LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); + }); + } + } + BasebackupQueryTimeOngoingRecording { + parent: self, + ctx, + start, + } + } +} + +impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { + pub(crate) fn observe(self, res: &Result) { + let elapsed = self.start.elapsed(); + let ex_throttled = self + .ctx + .micros_spent_throttled + .close_and_checked_sub_from(elapsed); + let ex_throttled = match ex_throttled { + Ok(ex_throttled) => ex_throttled, + Err(error) => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); + }); + elapsed + } + }; let label_value = if res.is_ok() { "ok" } else { "error" }; - let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap(); - metric.observe(duration.as_secs_f64()); + let metric = self + .parent + .0 + .get_metric_with_label_values(&[label_value]) + .unwrap(); + metric.observe(ex_throttled.as_secs_f64()); } } @@ -1207,78 +1491,119 @@ pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { // remote storage metrics -/// NB: increment _after_ recording the current value into [`REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST`]. -static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_remote_timeline_client_calls_unfinished", - "Number of ongoing calls to remote timeline client. \ - Used to populate pageserver_remote_timeline_client_calls_started. 
\ - This metric is not useful for sampling from Prometheus, but useful in tests.", - &["tenant_id", "timeline_id", "file_kind", "op_kind"], - ) - .expect("failed to define a metric") -}); - -static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy = Lazy::new(|| { - register_histogram_vec!( +static REMOTE_TIMELINE_CLIENT_CALLS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( "pageserver_remote_timeline_client_calls_started", - "When calling a remote timeline client method, we record the current value \ - of the calls_unfinished gauge in this histogram. Plot the histogram \ - over time in a heatmap to visualize how many operations were ongoing \ - at a given instant. It gives you a better idea of the queue depth \ - than plotting the gauge directly, since operations may complete faster \ - than the sampling interval.", - &["file_kind", "op_kind"], - // The calls_unfinished gauge is an integer gauge, hence we have integer buckets. - vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0], + "Number of started calls to remote timeline client.", + "pageserver_remote_timeline_client_calls_finished", + "Number of finshed calls to remote timeline client.", + &[ + "tenant_id", + "shard_id", + "timeline_id", + "file_kind", + "op_kind" + ], ) - .expect("failed to define a metric") + .unwrap() }); -static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( +static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy = + Lazy::new(|| { + register_int_counter_vec!( "pageserver_remote_timeline_client_bytes_started", "Incremented by the number of bytes associated with a remote timeline client operation. 
\ The increment happens when the operation is scheduled.", - &["tenant_id", "timeline_id", "file_kind", "op_kind"], + &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"], ) - .expect("failed to define a metric") -}); + .expect("failed to define a metric") + }); static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_remote_timeline_client_bytes_finished", "Incremented by the number of bytes associated with a remote timeline client operation. \ The increment happens when the operation finishes (regardless of success/failure/shutdown).", - &["tenant_id", "timeline_id", "file_kind", "op_kind"], + &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"], ) .expect("failed to define a metric") }); pub(crate) struct TenantManagerMetrics { - pub(crate) tenant_slots: UIntGauge, + tenant_slots_attached: UIntGauge, + tenant_slots_secondary: UIntGauge, + tenant_slots_inprogress: UIntGauge, pub(crate) tenant_slot_writes: IntCounter, pub(crate) unexpected_errors: IntCounter, } +impl TenantManagerMetrics { + /// Helpers for tracking slots. Note that these do not track the lifetime of TenantSlot objects + /// exactly: they track the lifetime of the slots _in the tenant map_. 
+ pub(crate) fn slot_inserted(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.inc(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.inc(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.inc(); + } + } + } + + pub(crate) fn slot_removed(&self, slot: &TenantSlot) { + match slot { + TenantSlot::Attached(_) => { + self.tenant_slots_attached.dec(); + } + TenantSlot::Secondary(_) => { + self.tenant_slots_secondary.dec(); + } + TenantSlot::InProgress(_) => { + self.tenant_slots_inprogress.dec(); + } + } + } + + #[cfg(all(debug_assertions, not(test)))] + pub(crate) fn slots_total(&self) -> u64 { + self.tenant_slots_attached.get() + + self.tenant_slots_secondary.get() + + self.tenant_slots_inprogress.get() + } +} + pub(crate) static TENANT_MANAGER: Lazy = Lazy::new(|| { - TenantManagerMetrics { - tenant_slots: register_uint_gauge!( + let tenant_slots = register_uint_gauge_vec!( "pageserver_tenant_manager_slots", "How many slots currently exist, including all attached, secondary and in-progress operations", + &["mode"] ) - .expect("failed to define a metric"), - tenant_slot_writes: register_int_counter!( - "pageserver_tenant_manager_slot_writes", - "Writes to a tenant slot, including all of create/attach/detach/delete" - ) - .expect("failed to define a metric"), - unexpected_errors: register_int_counter!( - "pageserver_tenant_manager_unexpected_errors_total", - "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." 
- ) - .expect("failed to define a metric"), -} + .expect("failed to define a metric"); + TenantManagerMetrics { + tenant_slots_attached: tenant_slots + .get_metric_with_label_values(&["attached"]) + .unwrap(), + tenant_slots_secondary: tenant_slots + .get_metric_with_label_values(&["secondary"]) + .unwrap(), + tenant_slots_inprogress: tenant_slots + .get_metric_with_label_values(&["inprogress"]) + .unwrap(), + tenant_slot_writes: register_int_counter!( + "pageserver_tenant_manager_slot_writes", + "Writes to a tenant slot, including all of create/attach/detach/delete" + ) + .expect("failed to define a metric"), + unexpected_errors: register_int_counter!( + "pageserver_tenant_manager_unexpected_errors_total", + "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug." + ) + .expect("failed to define a metric"), + } }); pub(crate) struct DeletionQueueMetrics { @@ -1336,29 +1661,6 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { } }); -pub(crate) struct WalIngestMetrics { - pub(crate) records_received: IntCounter, - pub(crate) records_committed: IntCounter, - pub(crate) records_filtered: IntCounter, -} - -pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { - records_received: register_int_counter!( - "pageserver_wal_ingest_records_received", - "Number of WAL records received from safekeepers" - ) - .expect("failed to define a metric"), - records_committed: register_int_counter!( - "pageserver_wal_ingest_records_committed", - "Number of WAL records which resulted in writes to pageserver storage" - ) - .expect("failed to define a metric"), - records_filtered: register_int_counter!( - "pageserver_wal_ingest_records_filtered", - "Number of WAL records filtered out due to sharding" - ) - .expect("failed to define a metric"), -}); pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, @@ -1366,7 +1668,8 @@ pub(crate) struct SecondaryModeMetrics { 
pub(crate) download_heatmap: IntCounter, pub(crate) download_layer: IntCounter, } -pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { +pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { + SecondaryModeMetrics { upload_heatmap: register_int_counter!( "pageserver_secondary_upload_heatmap", "Number of heatmaps written to remote storage by attached tenants" @@ -1384,7 +1687,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco .expect("failed to define a metric"), download_heatmap: register_int_counter!( "pageserver_secondary_download_heatmap", - "Number of downloads of heatmaps by secondary mode locations" + "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed" ) .expect("failed to define a metric"), download_layer: register_int_counter!( @@ -1392,6 +1695,7 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Number of downloads of layers by secondary mode locations" ) .expect("failed to define a metric"), +} }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1558,6 +1862,36 @@ macro_rules! 
redo_bytes_histogram_count_buckets { }; } +pub(crate) struct WalIngestMetrics { + pub(crate) bytes_received: IntCounter, + pub(crate) records_received: IntCounter, + pub(crate) records_committed: IntCounter, + pub(crate) records_filtered: IntCounter, +} + +pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + bytes_received: register_int_counter!( + "pageserver_wal_ingest_bytes_received", + "Bytes of WAL ingested from safekeepers", + ) + .unwrap(), + records_received: register_int_counter!( + "pageserver_wal_ingest_records_received", + "Number of WAL records received from safekeepers" + ) + .expect("failed to define a metric"), + records_committed: register_int_counter!( + "pageserver_wal_ingest_records_committed", + "Number of WAL records which resulted in writes to pageserver storage" + ) + .expect("failed to define a metric"), + records_filtered: register_int_counter!( + "pageserver_wal_ingest_records_filtered", + "Number of WAL records filtered out due to sharding" + ) + .expect("failed to define a metric"), +}); + pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", @@ -1594,11 +1928,18 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +#[rustfmt::skip] pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_process_launch_duration", "Histogram of the duration of successful WalRedoProcess::launch calls", - redo_histogram_time_buckets!(), + vec![ + 0.0002, 0.0004, 0.0006, 0.0008, 0.0010, + 0.0020, 0.0040, 0.0060, 0.0080, 0.0100, + 0.0200, 0.0400, 0.0600, 0.0800, 0.1000, + 0.2000, 0.4000, 0.6000, 0.8000, 1.0000, + 1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000 + ], ) .expect("failed to define a metric") }); @@ -1681,6 +2022,22 @@ impl StorageTimeMetricsTimer { self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(duration); } + + /// Turns this timer into a timer, which 
will always record -- usually this means recording + /// regardless an early `?` path was taken in a function. + pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer { + AlwaysRecordingStorageTimeMetricsTimer(Some(self)) + } +} + +pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); + +impl Drop for AlwaysRecordingStorageTimeMetricsTimer { + fn drop(&mut self) { + if let Some(inner) = self.0.take() { + inner.stop_and_record(); + } + } } /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and @@ -1696,14 +2053,19 @@ pub(crate) struct StorageTimeMetrics { } impl StorageTimeMetrics { - pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self { + pub fn new( + operation: StorageTimeOperation, + tenant_id: &str, + shard_id: &str, + timeline_id: &str, + ) -> Self { let operation: &'static str = operation.into(); let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE - .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) .unwrap(); let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE - .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) .unwrap(); let global_histogram = STORAGE_TIME_GLOBAL .get_metric_with_label_values(&[operation]) @@ -1736,59 +2098,110 @@ pub(crate) struct TimelineMetrics { pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, + pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, - resident_physical_size_gauge: UIntGauge, + pub standby_horizon_gauge: IntGauge, + pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, - pub num_persistent_files_created: IntCounter, - pub 
persistent_bytes_written: IntCounter, + pub aux_file_size_gauge: IntGauge, + pub directory_entries_count_gauge: Lazy UIntGauge>>, pub evictions: IntCounter, pub evictions_with_low_residence_duration: std::sync::RwLock, + shutdown: std::sync::atomic::AtomicBool, } impl TimelineMetrics { pub fn new( tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, + timeline_id_raw: &TimelineId, evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, ) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_id = format!("{}", tenant_shard_id.shard_slug()); - let timeline_id = timeline_id.to_string(); - let flush_time_histo = - StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id); - let compact_time_histo = - StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id); - let create_images_time_histo = - StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id); - let logical_size_histo = - StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id); + let timeline_id = timeline_id_raw.to_string(); + let flush_time_histo = StorageTimeMetrics::new( + StorageTimeOperation::LayerFlush, + &tenant_id, + &shard_id, + &timeline_id, + ); + let compact_time_histo = StorageTimeMetrics::new( + StorageTimeOperation::Compact, + &tenant_id, + &shard_id, + &timeline_id, + ); + let create_images_time_histo = StorageTimeMetrics::new( + StorageTimeOperation::CreateImages, + &tenant_id, + &shard_id, + &timeline_id, + ); + let logical_size_histo = StorageTimeMetrics::new( + StorageTimeOperation::LogicalSize, + &tenant_id, + &shard_id, + &timeline_id, + ); let imitate_logical_size_histo = StorageTimeMetrics::new( StorageTimeOperation::ImitateLogicalSize, &tenant_id, + &shard_id, + &timeline_id, + ); + let load_layer_map_histo = StorageTimeMetrics::new( + StorageTimeOperation::LoadLayerMap, + &tenant_id, + &shard_id, + &timeline_id, + ); + 
let garbage_collect_histo = StorageTimeMetrics::new( + StorageTimeOperation::Gc, + &tenant_id, + &shard_id, + &timeline_id, + ); + let find_gc_cutoffs_histo = StorageTimeMetrics::new( + StorageTimeOperation::FindGcCutoffs, + &tenant_id, + &shard_id, &timeline_id, ); - let load_layer_map_histo = - StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id); - let garbage_collect_histo = - StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id); let last_record_gauge = LAST_RECORD_LSN - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + // TODO: we shouldn't expose this metric let current_logical_size_gauge = CURRENT_LOGICAL_SIZE - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); - let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); - let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + let aux_file_size_gauge = AUX_FILE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + // TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065 + let directory_entries_count_gauge_closure = { + let tenant_shard_id = *tenant_shard_id; + let timeline_id_raw = *timeline_id_raw; + move || { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", 
tenant_shard_id.shard_slug()); + let timeline_id = timeline_id_raw.to_string(); + let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + gauge + } + }; + let directory_entries_count_gauge: Lazy UIntGauge>> = + Lazy::new(Box::new(directory_entries_count_gauge_closure)); let evictions = EVICTIONS - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder .build(&tenant_id, &shard_id, &timeline_id); @@ -1803,23 +2216,24 @@ impl TimelineMetrics { logical_size_histo, imitate_logical_size_histo, garbage_collect_histo, + find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, - num_persistent_files_created, - persistent_bytes_written, + aux_file_size_gauge, + directory_entries_count_gauge, evictions, evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + shutdown: std::sync::atomic::AtomicBool::default(), } } pub(crate) fn record_new_file_metrics(&self, sz: u64) { self.resident_physical_size_add(sz); - self.num_persistent_files_created.inc_by(1); - self.persistent_bytes_written.inc_by(sz); } pub(crate) fn resident_physical_size_sub(&self, sz: u64) { @@ -1835,22 +2249,34 @@ impl TimelineMetrics { pub(crate) fn resident_physical_size_get(&self) -> u64 { self.resident_physical_size_gauge.get() } -} -impl Drop for TimelineMetrics { - fn drop(&mut self) { + pub(crate) fn shutdown(&self) { + let was_shutdown = self + .shutdown + .swap(true, std::sync::atomic::Ordering::Relaxed); + + if was_shutdown { + // this happens on tenant deletion because tenant first shuts down timelines, then + // invokes timeline deletion which first shuts down the timeline again. 
+ // TODO: this can be removed once https://github.com/neondatabase/neon/issues/5080 + return; + } + let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; - let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); + let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); - let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } - let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); - let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); - let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); - let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); + let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { + let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); + } + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); self.evictions_with_low_residence_duration .write() @@ -1863,117 +2289,125 @@ impl Drop for TimelineMetrics { // outlive an individual smgr connection, but not the timeline. 
for op in StorageTimeOperation::VARIANTS { - let _ = - STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); - let _ = - STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[ + op, + tenant_id, + shard_id, + timeline_id, + ]); + let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[ + op, + tenant_id, + shard_id, + timeline_id, + ]); } for op in STORAGE_IO_SIZE_OPERATIONS { - let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } for op in SmgrQueryType::iter() { let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ op.into(), tenant_id, + shard_id, timeline_id, ]); } } } -pub fn remove_tenant_metrics(tenant_id: &TenantId) { - let tid = tenant_id.to_string(); - let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); +pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) { + // Only shard zero deals in synthetic sizes + if tenant_shard_id.is_shard_zero() { + let tid = tenant_shard_id.tenant_id.to_string(); + let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]); + } + // we leave the BROKEN_TENANTS_SET entry if any } use futures::Future; use pin_project_lite::pin_project; use std::collections::HashMap; +use std::num::NonZeroUsize; use std::pin::Pin; +use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; +use crate::tenant::mgr::TenantSlot; /// Maintain a per timeline gauge in addition to the global gauge. 
-struct PerTimelineRemotePhysicalSizeGauge { - last_set: u64, +pub(crate) struct PerTimelineRemotePhysicalSizeGauge { + last_set: AtomicU64, gauge: UIntGauge, } impl PerTimelineRemotePhysicalSizeGauge { fn new(per_timeline_gauge: UIntGauge) -> Self { Self { - last_set: per_timeline_gauge.get(), + last_set: AtomicU64::new(0), gauge: per_timeline_gauge, } } - fn set(&mut self, sz: u64) { + pub(crate) fn set(&self, sz: u64) { self.gauge.set(sz); - if sz < self.last_set { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz); + let prev = self.last_set.swap(sz, std::sync::atomic::Ordering::Relaxed); + if sz < prev { + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(prev - sz); } else { - REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - prev); }; - self.last_set = sz; } - fn get(&self) -> u64 { + pub(crate) fn get(&self) -> u64 { self.gauge.get() } } impl Drop for PerTimelineRemotePhysicalSizeGauge { fn drop(&mut self) { - REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set); + REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set.load(std::sync::atomic::Ordering::Relaxed)); } } pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, + shard_id: String, timeline_id: String, - remote_physical_size_gauge: Mutex>, - calls_unfinished_gauge: Mutex>, + pub(crate) remote_physical_size_gauge: PerTimelineRemotePhysicalSizeGauge, + calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, } impl RemoteTimelineClientMetrics { pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { + let tenant_id_str = tenant_shard_id.tenant_id.to_string(); + let shard_id_str = format!("{}", tenant_shard_id.shard_slug()); + let timeline_id_str = timeline_id.to_string(); + + let remote_physical_size_gauge = PerTimelineRemotePhysicalSizeGauge::new( + REMOTE_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str]) + .unwrap(), + ); + RemoteTimelineClientMetrics { - tenant_id: 
tenant_shard_id.tenant_id.to_string(), - timeline_id: timeline_id.to_string(), - calls_unfinished_gauge: Mutex::new(HashMap::default()), + tenant_id: tenant_id_str, + shard_id: shard_id_str, + timeline_id: timeline_id_str, + calls: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), - remote_physical_size_gauge: Mutex::new(None), + remote_physical_size_gauge, } } - pub(crate) fn remote_physical_size_set(&self, sz: u64) { - let mut guard = self.remote_physical_size_gauge.lock().unwrap(); - let gauge = guard.get_or_insert_with(|| { - PerTimelineRemotePhysicalSizeGauge::new( - REMOTE_PHYSICAL_SIZE - .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), - ]) - .unwrap(), - ) - }); - gauge.set(sz); - } - - pub(crate) fn remote_physical_size_get(&self) -> u64 { - let guard = self.remote_physical_size_gauge.lock().unwrap(); - guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0) - } - pub fn remote_operation_time( &self, file_kind: &RemoteOpFileKind, @@ -1986,18 +2420,19 @@ impl RemoteTimelineClientMetrics { .unwrap() } - fn calls_unfinished_gauge( + fn calls_counter_pair( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, - ) -> IntGauge { - let mut guard = self.calls_unfinished_gauge.lock().unwrap(); + ) -> IntCounterPair { + let mut guard = self.calls.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); let metric = guard.entry(key).or_insert_with(move || { - REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE + REMOTE_TIMELINE_CLIENT_CALLS .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), + &self.tenant_id, + &self.shard_id, + &self.timeline_id, key.0, key.1, ]) @@ -2006,17 +2441,6 @@ impl RemoteTimelineClientMetrics { metric.clone() } - fn calls_started_hist( - &self, - file_kind: &RemoteOpFileKind, - op_kind: &RemoteOpKind, - ) -> Histogram { - let key = 
(file_kind.as_str(), op_kind.as_str()); - REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST - .get_metric_with_label_values(&[key.0, key.1]) - .unwrap() - } - fn bytes_started_counter( &self, file_kind: &RemoteOpFileKind, @@ -2027,8 +2451,9 @@ impl RemoteTimelineClientMetrics { let metric = guard.entry(key).or_insert_with(move || { REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), + &self.tenant_id, + &self.shard_id, + &self.timeline_id, key.0, key.1, ]) @@ -2047,8 +2472,9 @@ impl RemoteTimelineClientMetrics { let metric = guard.entry(key).or_insert_with(move || { REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), + &self.tenant_id, + &self.shard_id, + &self.timeline_id, key.0, key.1, ]) @@ -2085,7 +2511,7 @@ impl RemoteTimelineClientMetrics { #[must_use] pub(crate) struct RemoteTimelineClientCallMetricGuard { /// Decremented on drop. - calls_unfinished_metric: Option, + calls_counter_pair: Option, /// If Some(), this references the bytes_finished metric, and we increment it by the given `u64` on drop. bytes_finished: Option<(IntCounter, u64)>, } @@ -2095,10 +2521,10 @@ impl RemoteTimelineClientCallMetricGuard { /// The caller vouches to do the metric updates manually. 
pub fn will_decrement_manually(mut self) { let RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric, + calls_counter_pair, bytes_finished, } = &mut self; - calls_unfinished_metric.take(); + calls_counter_pair.take(); bytes_finished.take(); } } @@ -2106,10 +2532,10 @@ impl RemoteTimelineClientCallMetricGuard { impl Drop for RemoteTimelineClientCallMetricGuard { fn drop(&mut self) { let RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric, + calls_counter_pair, bytes_finished, } = self; - if let Some(guard) = calls_unfinished_metric.take() { + if let Some(guard) = calls_counter_pair.take() { guard.dec(); } if let Some((bytes_finished_metric, value)) = bytes_finished { @@ -2142,10 +2568,8 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) -> RemoteTimelineClientCallMetricGuard { - let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); - self.calls_started_hist(file_kind, op_kind) - .observe(calls_unfinished_metric.get() as f64); - calls_unfinished_metric.inc(); // NB: inc after the histogram, see comment on underlying metric + let calls_counter_pair = self.calls_counter_pair(file_kind, op_kind); + calls_counter_pair.inc(); let bytes_finished = match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => { @@ -2159,7 +2583,7 @@ impl RemoteTimelineClientMetrics { } }; RemoteTimelineClientCallMetricGuard { - calls_unfinished_metric: Some(calls_unfinished_metric), + calls_counter_pair: Some(calls_counter_pair), bytes_finished, } } @@ -2173,12 +2597,8 @@ impl RemoteTimelineClientMetrics { op_kind: &RemoteOpKind, size: RemoteTimelineClientMetricsCallTrackSize, ) { - let calls_unfinished_metric = self.calls_unfinished_gauge(file_kind, op_kind); - debug_assert!( - calls_unfinished_metric.get() > 0, - "begin and end should cancel out" - ); - calls_unfinished_metric.dec(); + let calls_counter_pair = self.calls_counter_pair(file_kind, 
op_kind); + calls_counter_pair.dec(); match size { RemoteTimelineClientMetricsCallTrackSize::DontTrackSize { reason: _reason } => {} RemoteTimelineClientMetricsCallTrackSize::Bytes(size) => { @@ -2192,23 +2612,23 @@ impl Drop for RemoteTimelineClientMetrics { fn drop(&mut self) { let RemoteTimelineClientMetrics { tenant_id, + shard_id, timeline_id, remote_physical_size_gauge, - calls_unfinished_gauge, + calls, bytes_started_counter, bytes_finished_counter, } = self; - for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() { - let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[ - tenant_id, - timeline_id, - a, - b, - ]); + for ((a, b), _) in calls.get_mut().unwrap().drain() { + let mut res = [Ok(()), Ok(())]; + REMOTE_TIMELINE_CLIENT_CALLS + .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id, a, b]); + // don't care about results } for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ tenant_id, + shard_id, timeline_id, a, b, @@ -2217,6 +2637,7 @@ impl Drop for RemoteTimelineClientMetrics { for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[ tenant_id, + shard_id, timeline_id, a, b, @@ -2224,7 +2645,7 @@ impl Drop for RemoteTimelineClientMetrics { } { let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above - let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); + let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } } } @@ -2234,8 +2655,6 @@ impl Drop for RemoteTimelineClientMetrics { pub(crate) trait MeasureRemoteOp: Sized { fn measure_remote_op( self, - tenant_id: TenantId, - timeline_id: TimelineId, file_kind: RemoteOpFileKind, op: RemoteOpKind, metrics: Arc, @@ -2243,8 +2662,6 @@ pub(crate) trait MeasureRemoteOp: Sized { let start 
= Instant::now(); MeasuredRemoteOp { inner: self, - tenant_id, - timeline_id, file_kind, op, start, @@ -2260,8 +2677,6 @@ pin_project! { { #[pin] inner: F, - tenant_id: TenantId, - timeline_id: TimelineId, file_kind: RemoteOpFileKind, op: RemoteOpKind, start: Instant, @@ -2286,6 +2701,217 @@ impl>, O, E> Future for MeasuredRemoteOp { } } +pub mod tokio_epoll_uring { + use metrics::{register_int_counter, UIntGauge}; + use once_cell::sync::Lazy; + + pub struct Collector { + descs: Vec, + systems_created: UIntGauge, + systems_destroyed: UIntGauge, + } + + impl metrics::core::Collector for Collector { + fn desc(&self) -> Vec<&metrics::core::Desc> { + self.descs.iter().collect() + } + + fn collect(&self) -> Vec { + let mut mfs = Vec::with_capacity(Self::NMETRICS); + let tokio_epoll_uring::metrics::Metrics { + systems_created, + systems_destroyed, + } = tokio_epoll_uring::metrics::global(); + self.systems_created.set(systems_created); + mfs.extend(self.systems_created.collect()); + self.systems_destroyed.set(systems_destroyed); + mfs.extend(self.systems_destroyed.collect()); + mfs + } + } + + impl Collector { + const NMETRICS: usize = 2; + + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + let mut descs = Vec::new(); + + let systems_created = UIntGauge::new( + "pageserver_tokio_epoll_uring_systems_created", + "counter of tokio-epoll-uring systems that were created", + ) + .unwrap(); + descs.extend( + metrics::core::Collector::desc(&systems_created) + .into_iter() + .cloned(), + ); + + let systems_destroyed = UIntGauge::new( + "pageserver_tokio_epoll_uring_systems_destroyed", + "counter of tokio-epoll-uring systems that were destroyed", + ) + .unwrap(); + descs.extend( + metrics::core::Collector::desc(&systems_destroyed) + .into_iter() + .cloned(), + ); + + Self { + descs, + systems_created, + systems_destroyed, + } + } + } + + pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy = Lazy::new(|| { + register_int_counter!( + 
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count", + "Number of times where thread_local_system creation spanned multiple executor threads", + ) + .unwrap() + }); + + pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count", + "Number of times thread_local_system creation failed and was retried after back-off.", + ) + .unwrap() + }); +} + +pub(crate) mod tenant_throttling { + use metrics::{register_int_counter_vec, IntCounter}; + use once_cell::sync::Lazy; + + use crate::tenant::{self, throttle::Metric}; + + pub(crate) struct TimelineGet { + wait_time: IntCounter, + count: IntCounter, + } + + pub(crate) static TIMELINE_GET: Lazy = Lazy::new(|| { + static WAIT_USECS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_wait_usecs_sum_global", + "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.", + &["kind"] + ) + .unwrap() + }); + + static WAIT_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_tenant_throttling_count_global", + "Count of tenant throttlings, by kind of throttle.", + &["kind"] + ) + .unwrap() + }); + + let kind = "timeline_get"; + TimelineGet { + wait_time: WAIT_USECS.with_label_values(&[kind]), + count: WAIT_COUNT.with_label_values(&[kind]), + } + }); + + impl Metric for &'static TimelineGet { + #[inline(always)] + fn observe_throttling( + &self, + tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation, + ) { + let val = u64::try_from(wait_time.as_micros()).unwrap(); + self.wait_time.inc_by(val); + self.count.inc(); + } + } +} + +pub(crate) mod disk_usage_based_eviction { + use super::*; + + pub(crate) struct Metrics { + pub(crate) tenant_collection_time: Histogram, + pub(crate) tenant_layer_count: Histogram, + pub(crate) layers_collected: IntCounter, + pub(crate) layers_selected: IntCounter, + 
pub(crate) layers_evicted: IntCounter, + } + + impl Default for Metrics { + fn default() -> Self { + let tenant_collection_time = register_histogram!( + "pageserver_disk_usage_based_eviction_tenant_collection_seconds", + "Time spent collecting layers from a tenant -- not normalized by collected layer amount", + vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0] + ) + .unwrap(); + + let tenant_layer_count = register_histogram!( + "pageserver_disk_usage_based_eviction_tenant_collected_layers", + "Amount of layers gathered from a tenant", + vec![5.0, 50.0, 500.0, 5000.0, 50000.0] + ) + .unwrap(); + + let layers_collected = register_int_counter!( + "pageserver_disk_usage_based_eviction_collected_layers_total", + "Amount of layers collected" + ) + .unwrap(); + + let layers_selected = register_int_counter!( + "pageserver_disk_usage_based_eviction_select_layers_total", + "Amount of layers selected" + ) + .unwrap(); + + let layers_evicted = register_int_counter!( + "pageserver_disk_usage_based_eviction_evicted_layers_total", + "Amount of layers successfully evicted" + ) + .unwrap(); + + Self { + tenant_collection_time, + tenant_layer_count, + layers_collected, + layers_selected, + layers_evicted, + } + } + } + + pub(crate) static METRICS: Lazy = Lazy::new(Metrics::default); +} + +static TOKIO_EXECUTOR_THREAD_COUNT: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tokio_executor_thread_configured_count", + "Total number of configued tokio executor threads in the process. 
+ The `setup` label denotes whether we're running with multiple runtimes or a single runtime.", + &["setup"], + ) + .unwrap() +}); + +pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { + static SERIALIZE: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let _guard = SERIALIZE.lock().unwrap(); + TOKIO_EXECUTOR_THREAD_COUNT.reset(); + TOKIO_EXECUTOR_THREAD_COUNT + .get_metric_with_label_values(&[setup]) + .unwrap() + .set(u64::try_from(num_threads.get()).unwrap()); +} + pub fn preinitialize_metrics() { // Python tests need these and on some we do alerting. // @@ -2304,6 +2930,10 @@ pub fn preinitialize_metrics() { &WALRECEIVER_BROKER_UPDATES, &WALRECEIVER_CANDIDATES_ADDED, &WALRECEIVER_CANDIDATES_REMOVED, + &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES, + &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES, + &REMOTE_ONDEMAND_DOWNLOADED_LAYERS, + &REMOTE_ONDEMAND_DOWNLOADED_BYTES, ] .into_iter() .for_each(|c| { @@ -2320,6 +2950,13 @@ pub fn preinitialize_metrics() { Lazy::force(&TENANT_MANAGER); Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS); + Lazy::force(&disk_usage_based_eviction::METRICS); + + for state_name in pageserver_api::models::TenantState::VARIANTS { + // initialize the metric for all gauges, otherwise the time series might seemingly show + // values from last restart. 
+ TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0); + } // countervecs [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] @@ -2333,7 +2970,8 @@ pub fn preinitialize_metrics() { // histograms [ - &READ_NUM_FS_LAYERS, + &READ_NUM_LAYERS_VISITED, + &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, @@ -2347,4 +2985,5 @@ pub fn preinitialize_metrics() { // Custom Lazy::force(&RECONSTRUCT_TIME); + Lazy::force(&tenant_throttling::TIMELINE_GET); } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index c3c98af406..529fb9bb07 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -73,7 +73,6 @@ use std::{ collections::{hash_map::Entry, HashMap}, - convert::TryInto, sync::{ atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering}, Arc, Weak, @@ -262,7 +261,9 @@ pub struct PageCache { size_metrics: &'static PageCacheSizeMetrics, } -struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit); +struct PinnedSlotsPermit { + _permit: tokio::sync::OwnedSemaphorePermit, +} /// /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked @@ -550,7 +551,6 @@ impl PageCache { // not require changes. async fn try_get_pinned_slot_permit(&self) -> anyhow::Result { - let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer(); match tokio::time::timeout( // Choose small timeout, neon_smgr does its own retries. 
// https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869 @@ -559,11 +559,10 @@ impl PageCache { ) .await { - Ok(res) => Ok(PinnedSlotsPermit( - res.expect("this semaphore is never closed"), - )), + Ok(res) => Ok(PinnedSlotsPermit { + _permit: res.expect("this semaphore is never closed"), + }), Err(_timeout) => { - timer.stop_and_discard(); crate::metrics::page_cache_errors_inc( crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout, ); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 291490d016..ebc23e8945 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,31 +1,31 @@ -// //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. -// -// It is possible to connect here using usual psql/pgbench/libpq. Following -// commands are supported now: -// *status* -- show actual info about this pageserver, -// *pagestream* -- enter mode where smgr and pageserver talk with their -// custom protocol. 
-// use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use bytes::Bytes; +use futures::stream::FuturesUnordered; use futures::Stream; +use futures::StreamExt; +use pageserver_api::key::Key; use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, - PagestreamNblocksRequest, PagestreamNblocksResponse, + PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest, + PagestreamNblocksResponse, PagestreamProtocolVersion, }; -use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError}; +use pageserver_api::shard::ShardIndex; +use pageserver_api::shard::ShardNumber; +use pageserver_api::shard::TenantShardId; +use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::borrow::Cow; +use std::collections::HashMap; use std::io; use std::net::TcpListener; use std::pin::pin; @@ -33,13 +33,15 @@ use std::str; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; +use std::time::Instant; +use std::time::SystemTime; use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; -use tracing::field; use tracing::*; use utils::id::ConnectionId; +use utils::sync::gate::GateGuard; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, @@ -49,25 +51,30 @@ use utils::{ use crate::auth::check_permission; use crate::basebackup; -use crate::config::PageServerConf; +use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; use 
crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; -use crate::pgdatadir_mapping::{rel_block_to_key, Version}; +use crate::pgdatadir_mapping::Version; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::task_mgr; use crate::task_mgr::TaskKind; -use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::mgr; -use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; +use crate::tenant::mgr::GetTenantError; +use crate::tenant::mgr::ShardResolveResult; use crate::tenant::mgr::ShardSelector; +use crate::tenant::mgr::TenantManager; +use crate::tenant::timeline::FlushLayerError; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; +use crate::tenant::Tenant; use crate::tenant::Timeline; use crate::trace::Tracer; - +use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -81,8 +88,8 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); /// `tokio_tar` already read the first such block. Read the second all-zeros block, /// and check that there is no more data after the EOF marker. /// -/// XXX: Currently, any trailing data after the EOF marker prints a warning. -/// Perhaps it should be a hard error? +/// 'tar' command can also write extra blocks of zeros, up to a record +/// size, controlled by the --record-size argument. Ignore them too. 
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { use tokio::io::AsyncReadExt; let mut buf = [0u8; 512]; @@ -103,17 +110,24 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() anyhow::bail!("invalid tar EOF marker"); } - // Drain any data after the EOF marker + // Drain any extra zero-blocks after the EOF marker let mut trailing_bytes = 0; + let mut seen_nonzero_bytes = false; loop { let nbytes = reader.read(&mut buf).await?; trailing_bytes += nbytes; + if !buf.iter().all(|&x| x == 0) { + seen_nonzero_bytes = true; + } if nbytes == 0 { break; } } - if trailing_bytes > 0 { - warn!("ignored {trailing_bytes} unexpected bytes after the tar archive"); + if seen_nonzero_bytes { + anyhow::bail!("unexpected non-zero bytes after the tar archive"); + } + if trailing_bytes % 512 != 0 { + anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); } Ok(()) } @@ -126,7 +140,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() /// Listens for connections, and launches a new handler task for each. 
/// pub async fn libpq_listener_main( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, @@ -171,7 +185,7 @@ pub async fn libpq_listener_main( "serving compute connection task", false, page_service_conn_main( - conf, + tenant_manager.clone(), broker_client.clone(), local_auth, socket, @@ -194,7 +208,7 @@ pub async fn libpq_listener_main( #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, @@ -247,11 +261,14 @@ async fn page_service_conn_main( socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); let socket = std::pin::pin!(socket); + fail::fail_point!("ps::connection-start::pre-login"); + // XXX: pgbackend.run() should take the connection_ctx, // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx); + let mut conn_handler = + PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend @@ -274,17 +291,33 @@ async fn page_service_conn_main( } } +/// While a handler holds a reference to a Timeline, it also holds a the +/// timeline's Gate open. +struct HandlerTimeline { + timeline: Arc, + _guard: GateGuard, +} + struct PageServerHandler { - _conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, + tenant_manager: Arc, + /// The context created for the lifetime of the connection /// services by this PageServerHandler. 
/// For each query received over the connection, /// `process_query` creates a child context from this one. connection_ctx: RequestContext, + + /// See [`Self::cache_timeline`] for usage. + /// + /// Note on size: the typical size of this map is 1. The largest size we expect + /// to see is the number of shards divided by the number of pageservers (typically < 2), + /// or the ratio used when splitting shards (i.e. how many children created from one) + /// parent shard, where a "large" number might be ~8. + shard_timelines: HashMap, } #[derive(thiserror::Error, Debug)] @@ -299,8 +332,8 @@ enum PageStreamError { Shutdown, /// Something went wrong reading a page: this likely indicates a pageserver bug - #[error("Read error: {0}")] - Read(PageReconstructError), + #[error("Read error")] + Read(#[source] PageReconstructError), /// Ran out of time waiting for an LSN #[error("LSN timeout: {0}")] @@ -309,11 +342,11 @@ enum PageStreamError { /// The entity required to serve the request (tenant or timeline) is not found, /// or is not found in a suitable state to serve a request. #[error("Not found: {0}")] - NotFound(std::borrow::Cow<'static, str>), + NotFound(Cow<'static, str>), /// Request asked for something that doesn't make sense, like an invalid LSN #[error("Bad request: {0}")] - BadRequest(std::borrow::Cow<'static, str>), + BadRequest(Cow<'static, str>), } impl From for PageStreamError { @@ -340,31 +373,92 @@ impl From for PageStreamError { match value { e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), WaitLsnError::Shutdown => Self::Shutdown, - WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()), + e @ WaitLsnError::BadState { .. } => Self::Reconnect(format!("{e}").into()), + } + } +} + +impl From for QueryError { + fn from(value: WaitLsnError) -> Self { + match value { + e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)), + WaitLsnError::Shutdown => Self::Shutdown, + WaitLsnError::BadState { .. 
} => Self::Reconnect, } } } impl PageServerHandler { pub fn new( - conf: &'static PageServerConf, + tenant_manager: Arc, broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { - _conf: conf, + tenant_manager, broker_client, auth, claims: None, connection_ctx, + shard_timelines: HashMap::new(), } } - /// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use - /// this rather than naked flush() in order to shut down promptly. Without this, we would - /// block shutdown of a tenant if a postgres client was failing to consume bytes we send - /// in the flush. + /// Future that completes when we need to shut down the connection. + /// + /// We currently need to shut down when any of the following happens: + /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled + /// 2. task_mgr requests shutdown of the connection + /// + /// NB on (1): the connection's lifecycle is not actually tied to any of the + /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current + /// implementation to be responsive to timeline cancellation because + /// the connection holds their `GateGuards` open (sored in `shard_timelines`). + /// We currently do the easy thing and terminate the connection if any of the + /// shard_timelines gets cancelled. But really, we cuold spend more effort + /// and simply remove the cancelled timeline from the `shard_timelines`, thereby + /// dropping the guard. + /// + /// NB: keep in sync with [`Self::is_connection_cancelled`] + async fn await_connection_cancelled(&self) { + // A short wait before we expend the cycles to walk our timeline map. This avoids incurring + // that cost every time we check for cancellation. 
+ tokio::time::sleep(Duration::from_millis(10)).await; + + // This function is never called concurrently with code that adds timelines to shard_timelines, + // which is enforced by the borrow checker (the future returned by this function carries the + // immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk + // missing any inserts to the map. + + let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len()); + use futures::future::Either; + cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher())); + cancellation_sources.extend( + self.shard_timelines + .values() + .map(|ht| Either::Right(ht.timeline.cancel.cancelled())), + ); + FuturesUnordered::from_iter(cancellation_sources) + .next() + .await; + } + + /// Checking variant of [`Self::await_connection_cancelled`]. + fn is_connection_cancelled(&self) -> bool { + task_mgr::is_shutdown_requested() + || self + .shard_timelines + .values() + .any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping()) + } + + /// This function always respects cancellation of any timeline in `[Self::shard_timelines]`. Pass in + /// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect + /// cancellation if there aren't any timelines in the cache. + /// + /// If calling from a function that doesn't use the `[Self::shard_timelines]` cache, then pass in the + /// timeline cancellation token. async fn flush_cancellable( &self, pgb: &mut PostgresBackend, @@ -377,6 +471,9 @@ impl PageServerHandler { flush_r = pgb.flush() => { Ok(flush_r?) 
}, + _ = self.await_connection_cancelled() => { + Err(QueryError::Shutdown) + } _ = cancel.cancelled() => { Err(QueryError::Shutdown) } @@ -452,28 +549,21 @@ impl PageServerHandler { #[instrument(skip_all)] async fn handle_pagerequests( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + protocol_version: PagestreamProtocolVersion, ctx: RequestContext, ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - // Note that since one connection may contain getpage requests that target different - // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant - // that we look up here may not be the one that serves all the actual requests: we will double - // check the mapping of key->shard later before calling into Timeline for getpage requests. - let tenant = mgr::get_active_tenant_with_timeout( - tenant_id, - ShardSelector::First, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT) + .await?; // Make request tracer if needed let mut tracer = if tenant.get_trace_read_requests() { @@ -487,27 +577,15 @@ impl PageServerHandler { None }; - // Check that the timeline exists - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| QueryError::NotFound(format!("{e}").into()))?; - - // Avoid starting new requests if the timeline has already started shutting down, - // and block timeline shutdown until this request is complete, or drops out due - // to cancellation. 
- let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?; - // switch client to COPYBOTH pgb.write_message_noflush(&BeMessage::CopyBothResponse)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; - - let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id); + self.flush_cancellable(pgb, &tenant.cancel).await?; loop { let msg = tokio::select! { biased; - _ = timeline.cancel.cancelled() => { + _ = self.await_connection_cancelled() => { // We were requested to shut down. info!("shutdown request received in page handler"); return Err(QueryError::Shutdown) @@ -528,53 +606,66 @@ impl PageServerHandler { }; trace!("query: {copy_data_bytes:?}"); + fail::fail_point!("ps::handle-pagerequest-message"); // Trace request if needed if let Some(t) = tracer.as_mut() { t.trace(©_data_bytes) } - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + let neon_fe_msg = + PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?; // TODO: We could create a new per-request context here, with unique ID. 
// Currently we use the same per-timeline context for all requests let (response, span) = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { - let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists); - let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn); + fail::fail_point!("ps::handle-pagerequest-message::exists"); + let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); ( - self.handle_get_rel_exists_request(&timeline, &req, &ctx) + self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) .await, span, ) } PagestreamFeMessage::Nblocks(req) => { - let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize); - let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn); + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); ( - self.handle_get_nblocks_request(&timeline, &req, &ctx) + self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) .await, span, ) } PagestreamFeMessage::GetPage(req) => { - let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn); - let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn); + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + // shard_id is filled in by the handler + let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn); ( - self.handle_get_page_at_lsn_request(&timeline, &req, &ctx) + self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) .await, span, ) } PagestreamFeMessage::DbSize(req) => { - let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize); 
- let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn); + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); ( - self.handle_db_size_request(&timeline, &req, &ctx) + self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) + .instrument(span.clone()) + .await, + span, + ) + } + PagestreamFeMessage::GetSlruSegment(req) => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); + ( + self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx) .instrument(span.clone()) .await, span, @@ -594,7 +685,7 @@ impl PageServerHandler { span.in_scope(|| info!("handler requested reconnect: {reason}")); return Err(QueryError::Reconnect); } - Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => { + Err(e) if self.is_connection_cancelled() => { // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean // shutdown error, this may be buried inside a PageReconstructError::Other for example. // @@ -610,14 +701,17 @@ impl PageServerHandler { // print the all details to the log with {:#}, but for the client the // error message is enough. Do not log if shutting down, as the anyhow::Error // here includes cancellation which is not an error. 
- span.in_scope(|| error!("error reading relation or page version: {:#}", e)); + let full = utils::error::report_compact_sources(&e); + span.in_scope(|| { + error!("error reading relation or page version: {full:#}") + }); PagestreamBeMessage::Error(PagestreamErrorResponse { message: e.to_string(), }) }); pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; - self.flush_cancellable(pgb, &timeline.cancel).await?; + self.flush_cancellable(pgb, &tenant.cancel).await?; } } } @@ -639,17 +733,13 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); // Create empty timeline info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT) + .await?; let timeline = tenant .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) .await?; @@ -672,6 +762,7 @@ impl PageServerHandler { let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); timeline .import_basebackup_from_tar( + tenant.clone(), &mut copyin_reader, base_lsn, self.broker_client.clone(), @@ -692,7 +783,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip_all, fields(%start_lsn, %end_lsn))] + #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -705,8 +796,6 @@ impl PageServerHandler { where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); - let timeline = self .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) .await?; @@ -742,7 +831,10 @@ impl PageServerHandler { // We only want to persist the data, and it 
doesn't matter if it's in the // shape of deltas or images. info!("flushing layers"); - timeline.freeze_and_flush().await?; + timeline.freeze_and_flush().await.map_err(|e| match e { + FlushLayerError::Cancelled => QueryError::Shutdown, + other => QueryError::Other(other.into()), + })?; info!("done"); Ok(()) @@ -751,81 +843,140 @@ impl PageServerHandler { /// Helper function to handle the LSN from client request. /// /// Each GetPage (and Exists and Nblocks) request includes information about - /// which version of the page is being requested. The client can request the - /// latest version of the page, or the version that's valid at a particular - /// LSN. The primary compute node will always request the latest page - /// version, while a standby will request a version at the LSN that it's - /// currently caught up to. + /// which version of the page is being requested. The primary compute node + /// will always request the latest page version, by setting 'request_lsn' to + /// the last inserted or flushed WAL position, while a standby will request + /// a version at the LSN that it's currently caught up to. /// /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. + /// + /// In addition to the request LSN, each request carries another LSN, + /// 'not_modified_since', which is a hint to the pageserver that the client + /// knows that the page has not been modified between 'not_modified_since' + /// and the request LSN. This allows skipping the wait, as long as the WAL + /// up to 'not_modified_since' has arrived. If the client doesn't have any + /// information about when the page was modified, it will use + /// not_modified_since == lsn. 
If the client lies and sends a too low + /// not_modified_hint such that there are in fact later page versions, the + /// behavior is undefined: the pageserver may return any of the page versions + /// or an error. async fn wait_or_get_last_lsn( timeline: &Timeline, - mut lsn: Lsn, - latest: bool, + request_lsn: Lsn, + not_modified_since: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, ) -> Result { - if latest { - // Latest page version was requested. If LSN is given, it is a hint - // to the page server that there have been no modifications to the - // page after that LSN. If we haven't received WAL up to that point, - // wait until it arrives. - let last_record_lsn = timeline.get_last_record_lsn(); + let last_record_lsn = timeline.get_last_record_lsn(); - // Note: this covers the special case that lsn == Lsn(0). That - // special case means "return the latest version whatever it is", - // and it's used for bootstrapping purposes, when the page server is - // connected directly to the compute node. That is needed because - // when you connect to the compute node, to receive the WAL, the - // walsender process will do a look up in the pg_authid catalog - // table for authentication. That poses a deadlock problem: the - // catalog table lookup will send a GetPage request, but the GetPage - // request will block in the page server because the recent WAL - // hasn't been received yet, and it cannot be received until the - // walsender completes the authentication and starts streaming the - // WAL. 
- if lsn <= last_record_lsn { - lsn = last_record_lsn; + // Sanity check the request + if request_lsn < not_modified_since { + return Err(PageStreamError::BadRequest( + format!( + "invalid request with request LSN {} and not_modified_since {}", + request_lsn, not_modified_since, + ) + .into(), + )); + } + + if request_lsn < **latest_gc_cutoff_lsn { + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) } else { - timeline.wait_lsn(lsn, ctx).await?; - // Since we waited for 'lsn' to arrive, that is now the last - // record LSN. (Or close enough for our purposes; the - // last-record LSN can advance immediately after we return - // anyway) - } - } else { - if lsn == Lsn(0) { - return Err(PageStreamError::BadRequest( - "invalid LSN(0) in request".into(), - )); - } - timeline.wait_lsn(lsn, ctx).await?; + PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + request_lsn, **latest_gc_cutoff_lsn + ).into()) + }); } - if lsn < **latest_gc_cutoff_lsn { - return Err(PageStreamError::BadRequest(format!( - "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ).into())); + // Wait for WAL up to 'not_modified_since' to arrive, if necessary + if not_modified_since > last_record_lsn { + timeline + .wait_lsn( + not_modified_since, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; + // Since we waited for 'not_modified_since' to arrive, that is now the last + // record LSN. (Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + Ok(not_modified_since) + } else { + // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) + // here instead. 
That would give the same result, since we know that there + // haven't been any modifications since 'not_modified_since'. Using an older + // LSN might be faster, because that could allow skipping recent layers when + // finding the page. However, we have historically used 'last_record_lsn', so + // stick to that for now. + Ok(std::cmp::min(last_record_lsn, request_lsn)) } - Ok(lsn) } - async fn handle_get_rel_exists_request( + #[instrument(skip_all, fields(shard_id, %lsn))] + async fn handle_make_lsn_lease( &self, - timeline: &Timeline, + pgb: &mut PostgresBackend, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<(), QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + { + let shard_selector = ShardSelector::Known(tenant_shard_id.to_index()); + let timeline = self + .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector) + .await?; + let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?; + let valid_until = lease + .valid_until + .duration_since(SystemTime::UNIX_EPOCH) + .map_err(|e| QueryError::Other(e.into()))?; + + pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( + b"valid_until", + )]))? + .write_message_noflush(&BeMessage::DataRow(&[Some( + &valid_until.as_millis().to_be_bytes(), + )]))? 
+ .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + + Ok(()) + } + + #[instrument(skip_all, fields(shard_id))] + async fn handle_get_rel_exists_request( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { + let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + let _timer = timeline + .query_metrics + .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let exists = timeline - .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -833,19 +984,32 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_get_nblocks_request( - &self, - timeline: &Timeline, + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { + let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + + let _timer = timeline + .query_metrics + .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let n_blocks = timeline - .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .get_rel_size(req.rel, Version::Lsn(lsn), ctx) .await?; 
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { @@ -853,25 +1017,32 @@ impl PageServerHandler { })) } + #[instrument(skip_all, fields(shard_id))] async fn handle_db_size_request( - &self, - timeline: &Timeline, + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { + let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; + + let _timer = timeline + .query_metrics + .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; let total_blocks = timeline - .get_db_size( - DEFAULTTABLESPACE_OID, - req.dbnode, - Version::Lsn(lsn), - req.latest, - ctx, - ) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -880,18 +1051,175 @@ impl PageServerHandler { })) } - async fn do_handle_get_page_at_lsn_request( - &self, - timeline: &Timeline, + /// For most getpage requests, we will already have a Timeline to serve the request: this function + /// looks up such a Timeline synchronously and without touching any global state. + fn get_cached_timeline_for_page( + &mut self, + req: &PagestreamGetPageRequest, + ) -> Result<&Arc, Key> { + let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() { + // Fastest path: single sharded case + if first_idx.shard_count.count() == 1 { + return Ok(&first_timeline.timeline); + } + + let key = rel_block_to_key(req.rel, req.blkno); + let shard_num = first_timeline + .timeline + .get_shard_identity() + .get_shard_number(&key); + + // Fast path: matched the first timeline in our local handler map. 
This case is common if + // only one shard per tenant is attached to this pageserver. + if first_timeline.timeline.get_shard_identity().number == shard_num { + return Ok(&first_timeline.timeline); + } + + let shard_index = ShardIndex { + shard_number: shard_num, + shard_count: first_timeline.timeline.get_shard_identity().count, + }; + + // Fast-ish path: timeline is in the connection handler's local cache + if let Some(found) = self.shard_timelines.get(&shard_index) { + return Ok(&found.timeline); + } + + key + } else { + rel_block_to_key(req.rel, req.blkno) + }; + + Err(key) + } + + /// Having looked up the [`Timeline`] instance for a particular shard, cache it to enable + /// use in future requests without having to traverse [`crate::tenant::mgr::TenantManager`] + /// again. + /// + /// Note that all the Timelines in this cache are for the same timeline_id: they're differ + /// in which shard they belong to. When we serve a getpage@lsn request, we choose a shard + /// based on key. + /// + /// The typical size of this cache is 1, as we generally create shards to distribute work + /// across pageservers, so don't tend to have multiple shards for the same tenant on the + /// same pageserver. + fn cache_timeline( + &mut self, + timeline: Arc, + ) -> Result<&Arc, GetActiveTimelineError> { + let gate_guard = timeline + .gate + .enter() + .map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))?; + + let shard_index = timeline.tenant_shard_id.to_index(); + let entry = self + .shard_timelines + .entry(shard_index) + .or_insert(HandlerTimeline { + timeline, + _guard: gate_guard, + }); + + Ok(&entry.timeline) + } + + /// If [`Self::get_cached_timeline_for_page`] missed, then this function is used to populate the cache with + /// a Timeline to serve requests for this key, if such a Timeline is present on this pageserver. If no such + /// Timeline is found, then we will return an error (this indicates that the client is talking to the wrong node). 
+ async fn load_timeline_for_page( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + key: Key, + ) -> anyhow::Result<&Arc, GetActiveTimelineError> { + // Slow path: we must call out to the TenantManager to find the timeline for this Key + let timeline = self + .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Page(key)) + .await?; + + self.cache_timeline(timeline) + } + + async fn get_timeline_shard_zero( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> anyhow::Result<&Arc, GetActiveTimelineError> { + // This is a borrow-checker workaround: we can't return from inside of the `if let Some` because + // that would be an immutable-borrow-self return, whereas later in the function we will use a mutable + // ref to salf. So instead, we first build a bool, and then return while not borrowing self. + let have_cached = if let Some((idx, _tl)) = self.shard_timelines.iter().next() { + idx.shard_number == ShardNumber(0) + } else { + false + }; + + if have_cached { + let entry = self.shard_timelines.iter().next().unwrap(); + Ok(&entry.1.timeline) + } else { + let timeline = self + .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + .await?; + Ok(self.cache_timeline(timeline)?) + } + } + + #[instrument(skip_all, fields(shard_id))] + async fn handle_get_page_at_lsn_request( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, req: &PagestreamGetPageRequest, ctx: &RequestContext, ) -> Result { + let timeline = match self.get_cached_timeline_for_page(req) { + Ok(tl) => { + set_tracing_field_shard_id(tl); + tl + } + Err(key) => { + match self + .load_timeline_for_page(tenant_id, timeline_id, key) + .await + { + Ok(t) => t, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. 
Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); + } + Err(e) => return Err(e.into()), + } + } + }; + + let _timer = timeline + .query_metrics + .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx); + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; + let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -899,62 +1227,45 @@ impl PageServerHandler { })) } - async fn handle_get_page_at_lsn_request( - &self, - timeline: &Timeline, - req: &PagestreamGetPageRequest, + #[instrument(skip_all, fields(shard_id))] + async fn handle_get_slru_segment_request( + &mut self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let key = rel_block_to_key(req.rel, req.blkno); - if timeline.get_shard_identity().is_key_local(&key) { - self.do_handle_get_page_at_lsn_request(timeline, req, ctx) - .await - } else { - // The Tenant shard we looked up at connection start does not hold this particular - // key: look for other shards in this tenant. 
This scenario occurs if a pageserver - // has multiple shards for the same tenant. - // - // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037) - let timeline = match self - .get_active_tenant_timeline( - timeline.tenant_shard_id.tenant_id, - timeline.timeline_id, - ShardSelector::Page(key), - ) - .await - { - Ok(t) => t, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}", - timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key); - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return Err(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into(), - )); - } - Err(e) => return Err(e.into()), - }; + let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?; - // Take a GateGuard for the duration of this request. If we were using our main Timeline object, - // the GateGuard was already held over the whole connection. 
- let _timeline_guard = timeline - .gate - .enter() - .map_err(|_| PageStreamError::Shutdown)?; + let _timer = timeline + .query_metrics + .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); - self.do_handle_get_page_at_lsn_request(&timeline, req, ctx) - .await - } + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let lsn = Self::wait_or_get_last_lsn( + timeline, + req.request_lsn, + req.not_modified_since, + &latest_gc_cutoff_lsn, + ctx, + ) + .await?; + + let kind = SlruKind::from_repr(req.kind) + .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?; + let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?; + + Ok(PagestreamBeMessage::GetSlruSegment( + PagestreamGetSlruSegmentResponse { segment }, + )) } + /// Note on "fullbackup": + /// Full basebackups should only be used for debugging purposes. + /// Originally, it was introduced to enable breaking storage format changes, + /// but that is not applicable anymore. #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))] + #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))] async fn handle_basebackup_request( &mut self, pgb: &mut PostgresBackend, @@ -964,12 +1275,17 @@ impl PageServerHandler { prev_lsn: Option, full_backup: bool, gzip: bool, - ctx: RequestContext, - ) -> anyhow::Result<()> + ctx: &RequestContext, + ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - debug_assert_current_span_has_tenant_and_timeline_id(); + fn map_basebackup_error(err: BasebackupError) -> QueryError { + match err { + BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)), + BasebackupError::Server(e) => QueryError::Other(e), + } + } let started = std::time::Instant::now(); @@ -981,7 +1297,13 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. 
info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, &ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -990,7 +1312,8 @@ impl PageServerHandler { let lsn_awaited_after = started.elapsed(); // switch client to COPYOUT - pgb.write_message_noflush(&BeMessage::CopyOutResponse)?; + pgb.write_message_noflush(&BeMessage::CopyOutResponse) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; // Send a tarball of the latest layer on the timeline. Compress if not @@ -1003,9 +1326,10 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } else { let mut writer = pgb.copyout_writer(); if gzip { @@ -1024,11 +1348,15 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; // shutdown the encoder to ensure the gzip footer is written - encoder.shutdown().await?; + encoder + .shutdown() + .await + .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?; } else { basebackup::send_basebackup_tarball( &mut writer, @@ -1036,13 +1364,15 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, - &ctx, + ctx, ) - .await?; + .await + .map_err(map_basebackup_error)?; } } - pgb.write_message_noflush(&BeMessage::CopyDone)?; + pgb.write_message_noflush(&BeMessage::CopyDone) + .map_err(QueryError::Disconnected)?; self.flush_cancellable(pgb, &timeline.cancel).await?; let basebackup_after = started @@ -1083,17 +1413,69 @@ impl PageServerHandler { timeline_id: TimelineId, selector: ShardSelector, ) -> Result, GetActiveTimelineError> { - let tenant = get_active_tenant_with_timeout( - tenant_id, - selector, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await - .map_err(GetActiveTimelineError::Tenant)?; + let tenant = self + 
.get_active_tenant_with_timeout(tenant_id, selector, ACTIVE_TENANT_TIMEOUT) + .await + .map_err(GetActiveTimelineError::Tenant)?; let timeline = tenant.get_timeline(timeline_id, true)?; + set_tracing_field_shard_id(&timeline); Ok(timeline) } + + /// Get a shard's [`Tenant`] in its active state, if present. If we don't find the shard and some + /// slots for this tenant are `InProgress` then we will wait. + /// If we find the [`Tenant`] and it's not yet in state [`TenantState::Active`], we will wait. + /// + /// `timeout` is used as a total timeout for the whole wait operation. + async fn get_active_tenant_with_timeout( + &self, + tenant_id: TenantId, + shard_selector: ShardSelector, + timeout: Duration, + ) -> Result, GetActiveTenantError> { + let wait_start = Instant::now(); + let deadline = wait_start + timeout; + + // Resolve TenantId to TenantShardId. This is usually a quick one-shot thing, the loop is + // for handling the rare case that the slot we're accessing is InProgress. + let tenant_shard = loop { + let resolved = self + .tenant_manager + .resolve_attached_shard(&tenant_id, shard_selector); + match resolved { + ShardResolveResult::Found(tenant_shard) => break tenant_shard, + ShardResolveResult::NotFound => { + return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( + tenant_id, + ))); + } + ShardResolveResult::InProgress(barrier) => { + // We can't authoritatively answer right now: wait for InProgress state + // to end, then try again + tokio::select! 
{ + _ = self.await_connection_cancelled() => { + return Err(GetActiveTenantError::Cancelled) + }, + _ = barrier.wait() => { + // The barrier completed: proceed around the loop to try looking up again + }, + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + return Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state: None, + wait_time: timeout, + }); + } + } + } + }; + }; + + tracing::debug!("Waiting for tenant to enter active state..."); + tenant_shard + .wait_to_become_active(deadline.duration_since(Instant::now())) + .await?; + Ok(tenant_shard) + } } #[async_trait::async_trait] @@ -1135,6 +1517,7 @@ where _pgb: &mut PostgresBackend, _sm: &FeStartupPacket, ) -> Result<(), QueryError> { + fail::fail_point!("ps::connection-start::startup-packet"); Ok(()) } @@ -1149,11 +1532,12 @@ where Err(QueryError::SimulatedConnectionError) }); + fail::fail_point!("ps::connection-start::process-query"); + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); - if query_string.starts_with("pagestream ") { - let (_, params_raw) = query_string.split_at("pagestream ".len()); - let params = params_raw.split(' ').collect::>(); + let parts = query_string.split_whitespace().collect::>(); + if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for pagestream command" @@ -1170,12 +1554,40 @@ where self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) - .await?; - } else if query_string.starts_with("basebackup ") { - let (_, params_raw) = query_string.split_at("basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V2, + ctx, + ) + .await?; + } else if let Some(params) = parts.strip_prefix(&["pagestream"]) { + if params.len() != 2 { + return 
Err(QueryError::Other(anyhow::anyhow!( + "invalid param number for pagestream command" + ))); + } + let tenant_id = TenantId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_id))?; + + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V1, + ctx, + ) + .await?; + } else if let Some(params) = parts.strip_prefix(&["basebackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for basebackup command" @@ -1193,53 +1605,47 @@ where self.check_permission(Some(tenant_id))?; - let lsn = if params.len() >= 3 { + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let gzip = if params.len() >= 4 { - if params[3] == "--gzip" { - true - } else { + let gzip = match params.get(3) { + Some(&"--gzip") => true, + None => false, + Some(third_param) => { return Err(QueryError::Other(anyhow::anyhow!( - "Parameter in position 3 unknown {}", - params[3], - ))); + "Parameter in position 3 unknown {third_param}", + ))) } - } else { - false }; - ::metrics::metric_vec_duration::observe_async_block_duration_by_result( - &*metrics::BASEBACKUP_QUERY_TIME, - async move { - self.handle_basebackup_request( - pgb, - tenant_id, - timeline_id, - lsn, - None, - false, - gzip, - ctx, - ) - .await?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) - }, - ) - .await?; + let metric_recording = 
metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let res = async { + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + None, + false, + gzip, + &ctx, + ) + .await?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Result::<(), QueryError>::Ok(()) + } + .await; + metric_recording.observe(&res); + res?; } // return pair of prev_lsn and last_lsn - else if query_string.starts_with("get_last_record_rlsn ") { - let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len()); - let params = params_raw.split_whitespace().collect::>(); - + else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { if params.len() != 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for get_last_record_rlsn command" @@ -1256,27 +1662,32 @@ where .record("timeline_id", field::display(timeline_id)); self.check_permission(Some(tenant_id))?; - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; + async { + let timeline = self + .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) + .await?; - let end_of_timeline = timeline.get_last_record_rlsn(); + let end_of_timeline = timeline.get_last_record_rlsn(); - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor::text_col(b"prev_lsn"), + RowDescriptor::text_col(b"last_lsn"), + ]))? + .write_message_noflush(&BeMessage::DataRow(&[ + Some(end_of_timeline.prev.to_string().as_bytes()), + Some(end_of_timeline.last.to_string().as_bytes()), + ]))? 
+ .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + anyhow::Ok(()) + } + .instrument(info_span!( + "handle_get_last_record_lsn", + shard_id = tracing::field::Empty + )) + .await?; } // same as basebackup, but result includes relational data as well - else if query_string.starts_with("fullbackup ") { - let (_, params_raw) = query_string.split_at("fullbackup ".len()); - let params = params_raw.split_whitespace().collect::>(); - + else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for fullbackup command" @@ -1293,18 +1704,18 @@ where .record("timeline_id", field::display(timeline_id)); // The caller is responsible for providing correct lsn and prev_lsn. - let lsn = if params.len() > 2 { + let lsn = if let Some(lsn_str) = params.get(2) { Some( - Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?, + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, ) } else { None }; - let prev_lsn = if params.len() > 3 { + let prev_lsn = if let Some(prev_lsn_str) = params.get(3) { Some( - Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?, + Lsn::from_str(prev_lsn_str) + .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?, ) } else { None @@ -1321,7 +1732,7 @@ where prev_lsn, true, false, - ctx, + &ctx, ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; @@ -1337,8 +1748,7 @@ where // 2. 
Run: // cat my_backup/base.tar | psql -h $PAGESERVER \ // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let (_, params_raw) = query_string.split_at("import basebackup ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 5 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for import basebackup command" @@ -1387,8 +1797,7 @@ where // // Files are scheduled to be persisted to remote storage, and the // caller should poll the http api to check when that is done. - let (_, params_raw) = query_string.split_at("import wal ".len()); - let params = params_raw.split_whitespace().collect::>(); + let params = &parts[2..]; if params.len() != 4 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for import wal command" @@ -1426,10 +1835,45 @@ where // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("show ") { + } else if query_string.starts_with("lease lsn ") { + let params = &parts[2..]; + if params.len() != 3 { + return Err(QueryError::Other(anyhow::anyhow!( + "invalid param number {} for lease lsn command", + params.len() + ))); + } + + let tenant_shard_id = TenantShardId::from_str(params[0]) + .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; + let timeline_id = TimelineId::from_str(params[1]) + .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + + tracing::Span::current() + .record("tenant_id", field::display(tenant_shard_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_shard_id.tenant_id))?; + + // The caller is responsible for providing correct lsn. 
+ let lsn = Lsn::from_str(params[2]) + .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; + + match self + .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx) + .await + { + Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, + Err(e) => { + error!("error obtaining lsn lease for {lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? + } + }; + } else if let Some(params) = parts.strip_prefix(&["show"]) { // show - let (_, params_raw) = query_string.split_at("show ".len()); - let params = params_raw.split(' ').collect::>(); if params.len() != 1 { return Err(QueryError::Other(anyhow::anyhow!( "invalid param number for config command" @@ -1442,13 +1886,13 @@ where self.check_permission(Some(tenant_id))?; - let tenant = get_active_tenant_with_timeout( - tenant_id, - ShardSelector::Zero, - ACTIVE_TENANT_TIMEOUT, - &task_mgr::shutdown_token(), - ) - .await?; + let tenant = self + .get_active_tenant_with_timeout( + tenant_id, + ShardSelector::Zero, + ACTIVE_TENANT_TIMEOUT, + ) + .await?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), @@ -1500,9 +1944,11 @@ impl From for QueryError { GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ), - GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => { + GetActiveTenantError::Cancelled + | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. 
}) => { QueryError::Shutdown } + e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()), e => QueryError::Other(anyhow::anyhow!(e)), } } @@ -1525,3 +1971,12 @@ impl From for QueryError { } } } + +fn set_tracing_field_shard_id(timeline: &Timeline) { + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + tracing::Span::current().record( + "shard_id", + tracing::field::display(timeline.tenant_shard_id.shard_slug()), + ); + debug_assert_current_span_has_tenant_and_timeline_id(); +} diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f11a72f2ab..25d00d6dfd 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,26 +9,42 @@ use super::tenant::{PageReconstructError, Timeline}; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::repository::*; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; use crate::walrecord::NeonWalRecord; +use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; -use bytes::{Buf, Bytes}; -use pageserver_api::key::is_rel_block_key; -use pageserver_api::reltag::{RelTag, SlruKind}; +use bytes::{Buf, Bytes, BytesMut}; +use enum_map::Enum; +use itertools::Itertools; +use pageserver_api::key::{ + dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, + relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, + slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, +}; +use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::models::AuxFilePolicy; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; -use postgres_ffi::{Oid, TimestampTz, TransactionId}; 
+use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{hash_map, HashMap, HashSet}; use std::ops::ControlFlow; use std::ops::Range; +use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, trace, warn}; +use tracing::{debug, info, trace, warn}; use utils::bin_ser::DeserializeError; +use utils::pausable_failpoint; +use utils::vec_map::{VecMap, VecMapOrdering}; use utils::{bin_ser::BeSer, lsn::Lsn}; -/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. -pub type BlockNumber = u32; +/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. +pub const MAX_AUX_FILE_DELTAS: usize = 1024; + +/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached. +pub const MAX_AUX_FILE_V2_DELTAS: usize = 64; #[derive(Debug)] pub enum LsnForTimestamp { @@ -63,11 +79,19 @@ pub enum LsnForTimestamp { } #[derive(Debug, thiserror::Error)] -pub enum CalculateLogicalSizeError { +pub(crate) enum CalculateLogicalSizeError { #[error("cancelled")] Cancelled, + + /// Something went wrong while reading the metadata we use to calculate logical size + /// Note that cancellation variants of `PageReconstructError` are transformed to [`Self::Cancelled`] + /// in the `From` implementation for this variant. 
#[error(transparent)] - Other(#[from] anyhow::Error), + PageRead(PageReconstructError), + + /// Something went wrong deserializing metadata that we read to calculate logical size + #[error("decode error: {0}")] + Decode(#[from] DeserializeError), } #[derive(Debug, thiserror::Error)] @@ -92,10 +116,8 @@ impl From for CollectKeySpaceError { impl From for CalculateLogicalSizeError { fn from(pre: PageReconstructError) -> Self { match pre { - PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => { - Self::Cancelled - } - _ => Self::Other(pre.into()), + PageReconstructError::Cancelled => Self::Cancelled, + _ => Self::PageRead(pre), } } } @@ -151,6 +173,7 @@ impl Timeline { pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, + pending_directory_entries: Vec::new(), lsn, } } @@ -165,7 +188,6 @@ impl Timeline { tag: RelTag, blknum: BlockNumber, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -174,7 +196,7 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -196,7 +218,6 @@ impl Timeline { spcnode: Oid, dbnode: Oid, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; @@ -204,7 +225,7 @@ impl Timeline { let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -215,7 +236,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -229,7 +249,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == 
VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, version, latest, ctx).await? + && !self.get_rel_exists(tag, version, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -242,16 +262,8 @@ impl Timeline { let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); - if latest { - // Update relation size cache only if "latest" flag is set. - // This flag is set by compute when it is working with most recent version of relation. - // Typically master compute node always set latest=true. - // Please notice, that even if compute node "by mistake" specifies old LSN but set - // latest=true, then it can not cause cache corruption, because with latest=true - // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be - // associated with most recent value of LSN. - self.update_cached_rel_size(tag, version.get_lsn(), nblocks); - } + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); + Ok(nblocks) } @@ -260,7 +272,6 @@ impl Timeline { &self, tag: RelTag, version: Version<'_>, - _latest: bool, ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { @@ -279,7 +290,7 @@ impl Timeline { match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + let exists = dir.rels.contains(&(tag.relnode, tag.forknum)); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -318,6 +329,27 @@ impl Timeline { } } + /// Get the whole SLRU segment + pub(crate) async fn get_slru_segment( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + let n_blocks = self + .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) + .await?; + let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); + for blkno in 0..n_blocks { + let block = self + .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx) + .await?; 
+ segment.extend_from_slice(&block[..BLCKSZ as usize]); + } + Ok(segment.freeze()) + } + /// Look up given SLRU page version. pub(crate) async fn get_slru_page_at_lsn( &self, @@ -358,7 +390,7 @@ impl Timeline { match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { - let exists = dir.segments.get(&segno).is_some(); + let exists = dir.segments.contains(&segno); Ok(exists) } Err(e) => Err(PageReconstructError::from(e)), @@ -378,6 +410,8 @@ impl Timeline { cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { + pausable_failpoint!("find-lsn-for-timestamp-pausable"); + let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); // We use this method to figure out the branching LSN for the new branch, but the // GC cutoff could be before the branching point and we cannot create a new branch @@ -393,6 +427,7 @@ impl Timeline { let mut found_smaller = false; let mut found_larger = false; + while low < high { if cancel.is_cancelled() { return Err(PageReconstructError::Cancelled); @@ -435,6 +470,12 @@ impl Timeline { // Didn't find any commit timestamps smaller than the request Ok(LsnForTimestamp::Past(min_lsn)) } + (true, _) if commit_lsn < min_lsn => { + // the search above did set found_smaller to true but it never increased the lsn. + // Then, low is still the old min_lsn, and the subtraction above gave a value + // below the min_lsn. We should never do that. + Ok(LsnForTimestamp::Past(min_lsn)) + } (true, false) => { // Only found commits with timestamps smaller than the request. // It's still a valid case for branch creation, return it. @@ -531,6 +572,33 @@ impl Timeline { Ok(Default::default()) } + pub(crate) async fn get_slru_keyspace( + &self, + version: Version<'_>, + ctx: &RequestContext, + ) -> Result { + let mut accum = KeySpaceAccum::new(); + + for kind in SlruKind::iter() { + let mut segments: Vec = self + .list_slru_segments(kind, version, ctx) + .await? 
+ .into_iter() + .collect(); + segments.sort_unstable(); + + for seg in segments { + let block_count = self.get_slru_segment_size(kind, seg, version, ctx).await?; + + accum.add_range( + slru_block_to_key(kind, seg, 0)..slru_block_to_key(kind, seg, block_count), + ); + } + } + + Ok(accum.to_keyspace()) + } + /// Get a list of SLRU segments pub(crate) async fn list_slru_segments( &self, @@ -616,7 +684,7 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - pub(crate) async fn list_aux_files( + async fn list_aux_files_v1( &self, lsn: Lsn, ctx: &RequestContext, @@ -634,6 +702,101 @@ impl Timeline { } } + async fn list_aux_files_v2( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let kv = self + .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx) + .await + .context("scan")?; + let mut result = HashMap::new(); + let mut sz = 0; + for (_, v) in kv { + let v = v.context("get value")?; + let v = aux_file::decode_file_value_bytes(&v).context("value decode")?; + for (fname, content) in v { + sz += fname.len(); + sz += content.len(); + result.insert(fname, content); + } + } + self.aux_file_size_estimator.on_initial(sz); + Ok(result) + } + + pub(crate) async fn trigger_aux_file_size_computation( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + let current_policy = self.last_aux_file_policy.load(); + if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy { + self.list_aux_files_v2(lsn, ctx).await?; + } + Ok(()) + } + + pub(crate) async fn list_aux_files( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let current_policy = self.last_aux_file_policy.load(); + match current_policy { + Some(AuxFilePolicy::V1) | None => self.list_aux_files_v1(lsn, ctx).await, + Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, + Some(AuxFilePolicy::CrossValidation) => { + let v1_result = self.list_aux_files_v1(lsn, 
ctx).await; + let v2_result = self.list_aux_files_v2(lsn, ctx).await; + match (v1_result, v2_result) { + (Ok(v1), Ok(v2)) => { + if v1 != v2 { + tracing::error!( + "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}" + ); + return Err(PageReconstructError::Other(anyhow::anyhow!( + "unmatched aux file v1 v2 result" + ))); + } + Ok(v1) + } + (Ok(_), Err(v2)) => { + tracing::error!("aux file v1 returns Ok while aux file v2 returns an err"); + Err(v2) + } + (Err(v1), Ok(_)) => { + tracing::error!("aux file v2 returns Ok while aux file v1 returns an err"); + Err(v1) + } + (Err(_), Err(v2)) => Err(v2), + } + } + } + } + + pub(crate) async fn get_replorigins( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, PageReconstructError> { + let kv = self + .scan(KeySpace::single(repl_origin_key_range()), lsn, ctx) + .await + .context("scan")?; + let mut result = HashMap::new(); + for (k, v) in kv { + let v = v.context("get value")?; + let origin_id = k.field6 as RepOriginId; + let origin_lsn = Lsn::des(&v).unwrap(); + if origin_lsn != Lsn::INVALID { + result.insert(origin_id, origin_lsn); + } + } + Ok(result) + } + /// Does the same as get_current_logical_size but counted on demand. /// Used to initialize the logical size tracking on startup. /// @@ -643,16 +806,16 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn get_current_logical_size_non_incremental( + pub(crate) async fn get_current_logical_size_non_incremental( &self, lsn: Lsn, ctx: &RequestContext, ) -> Result { - crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); // Fetch list of database dirs and iterate them let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; + let dbdir = DbDirectory::des(&buf)?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { @@ -677,11 +840,13 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). + /// + /// The return value is (dense keyspace, sparse keyspace). pub(crate) async fn collect_keyspace( &self, lsn: Lsn, ctx: &RequestContext, - ) -> Result { + ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -753,13 +918,28 @@ impl Timeline { if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { result.add_key(AUX_FILES_KEY); } - Ok(result.to_keyspace()) + + #[cfg(test)] + { + let guard = self.extra_test_dense_keyspace.load(); + for kr in &guard.ranges { + result.add_range(kr.clone()); + } + } + + Ok(( + result.to_keyspace(), + /* AUX sparse key space */ + SparseKeySpace(KeySpace { + ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()], + }), + )) } /// Get cached size of relation if it not updated after specified LSN pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option { let rel_size_cache = self.rel_size_cache.read().unwrap(); - if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) { + if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) { if lsn >= *cached_lsn { return 
Some(*nblocks); } @@ -770,7 +950,16 @@ impl Timeline { /// Update cached relation size if there is no more recent update pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - match rel_size_cache.entry(tag) { + + if lsn < rel_size_cache.complete_as_of { + // Do not cache old values. It's safe to cache the size on read, as long as + // the read was at an LSN since we started the WAL ingestion. Reasoning: we + // never evict values from the cache, so if the relation size changed after + // 'lsn', the new value is already in the cache. + return; + } + + match rel_size_cache.map.entry(tag) { hash_map::Entry::Occupied(mut entry) => { let cached_lsn = entry.get_mut(); if lsn >= cached_lsn.0 { @@ -786,13 +975,13 @@ impl Timeline { /// Store cached relation size pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.insert(tag, (lsn, nblocks)); + rel_size_cache.map.insert(tag, (lsn, nblocks)); } /// Remove cached relation size pub fn remove_cached_rel_size(&self, tag: &RelTag) { let mut rel_size_cache = self.rel_size_cache.write().unwrap(); - rel_size_cache.remove(tag); + rel_size_cache.map.remove(tag); } } @@ -816,6 +1005,10 @@ pub struct DatadirModification<'a> { pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, + + /// For special "directory" keys that store key-value maps, track the size of the map + /// if it was updated in this modification. 
+ pending_directory_entries: Vec<(DirectoryKind, usize)>, } impl<'a> DatadirModification<'a> { @@ -847,6 +1040,7 @@ impl<'a> DatadirModification<'a> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), })?; + self.pending_directory_entries.push((DirectoryKind::Db, 0)); self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory @@ -855,16 +1049,24 @@ impl<'a> DatadirModification<'a> { let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { xids: HashSet::new(), })?; + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, 0)); self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); let empty_dir = Value::Image(buf); self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), empty_dir.clone(), ); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0)); self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0)); Ok(()) } @@ -965,6 +1167,7 @@ impl<'a> DatadirModification<'a> { let buf = RelDirectory::ser(&RelDirectory { rels: HashSet::new(), })?; + self.pending_directory_entries.push((DirectoryKind::Rel, 0)); self.put( rel_dir_to_key(spcnode, dbnode), Value::Image(Bytes::from(buf)), @@ -987,6 +1190,8 @@ impl<'a> DatadirModification<'a> { if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); self.put( TWOPHASEDIR_KEY, Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), @@ -996,6 +1201,20 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub async fn set_replorigin( + &mut self, + origin_id: RepOriginId, + origin_lsn: Lsn, + ) -> 
anyhow::Result<()> { + let key = repl_origin_key(origin_id); + self.put(key, Value::Image(origin_lsn.ser().unwrap().into())); + Ok(()) + } + + pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> { + self.set_replorigin(origin_id, Lsn::INVALID).await + } + pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) @@ -1014,7 +1233,7 @@ impl<'a> DatadirModification<'a> { ) -> anyhow::Result<()> { let total_blocks = self .tline - .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) .await?; // Remove entry from dbdir @@ -1022,6 +1241,8 @@ impl<'a> DatadirModification<'a> { let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; + self.pending_directory_entries + .push((DirectoryKind::Db, dir.dbdirs.len())); self.put(DBDIR_KEY, Value::Image(buf.into())); } else { warn!( @@ -1055,24 +1276,31 @@ impl<'a> DatadirModification<'a> { let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) .context("deserialize db")?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { - // Didn't exist. Update dbdir - dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; - self.put(DBDIR_KEY, Value::Image(buf.into())); + let mut rel_dir = + if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { + // Didn't exist. 
Update dbdir + e.insert(false); + let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + self.pending_directory_entries + .push((DirectoryKind::Db, dbdir.dbdirs.len())); + self.put(DBDIR_KEY, Value::Image(buf.into())); - // and create the RelDirectory - RelDirectory::default() - } else { - // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? - }; + // and create the RelDirectory + RelDirectory::default() + } else { + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) + .context("deserialize db")? + }; // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { return Err(RelationError::AlreadyExists); } + + self.pending_directory_entries + .push((DirectoryKind::Rel, rel_dir.rels.len())); + self.put( rel_dir_key, Value::Image(Bytes::from( @@ -1105,7 +1333,7 @@ impl<'a> DatadirModification<'a> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline - .get_rel_exists(rel, Version::Modified(self), true, ctx) + .get_rel_exists(rel, Version::Modified(self), ctx) .await? 
{ let size_key = rel_size_to_key(rel); @@ -1164,6 +1392,9 @@ impl<'a> DatadirModification<'a> { let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; + self.pending_directory_entries + .push((DirectoryKind::Rel, dir.rels.len())); + if dir.rels.remove(&(rel.relnode, rel.forknum)) { self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); } else { @@ -1199,6 +1430,8 @@ impl<'a> DatadirModification<'a> { if !dir.segments.insert(segno) { anyhow::bail!("slru segment {kind:?}/{segno} already exists"); } + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1243,6 +1476,8 @@ impl<'a> DatadirModification<'a> { if !dir.segments.remove(&segno) { warn!("slru segment {:?}/{} does not exist", kind, segno); } + self.pending_directory_entries + .push((DirectoryKind::SlruSegment(kind), dir.segments.len())); self.put( dir_key, Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), @@ -1273,6 +1508,8 @@ impl<'a> DatadirModification<'a> { if !dir.xids.remove(&xid) { warn!("twophase file for xid {} does not exist", xid); } + self.pending_directory_entries + .push((DirectoryKind::TwoPhase, dir.xids.len())); self.put( TWOPHASEDIR_KEY, Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), @@ -1285,9 +1522,14 @@ impl<'a> DatadirModification<'a> { } pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { + if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() { + return Ok(()); + } let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { files: HashMap::new(), })?; + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, 0)); self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); Ok(()) } @@ -1298,28 +1540,176 @@ impl<'a> DatadirModification<'a> { content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let mut dir = match self.get(AUX_FILES_KEY, ctx).await { - Ok(buf) => 
AuxFilesDirectory::des(&buf)?, - Err(e) => { - // This is expected: historical databases do not have the key. - debug!("Failed to get info about AUX files: {}", e); - AuxFilesDirectory { - files: HashMap::new(), + let switch_policy = self.tline.get_switch_aux_file_policy(); + + let policy = { + let current_policy = self.tline.last_aux_file_policy.load(); + // Allowed switch path: + // * no aux files -> v1/v2/cross-validation + // * cross-validation->v2 + + let current_policy = if current_policy.is_none() { + // This path will only be hit once per tenant: we will decide the final policy in this code block. + // The next call to `put_file` will always have `last_aux_file_policy != None`. + let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?; + if aux_files_key_v1.is_empty() { + None + } else { + self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; + Some(AuxFilePolicy::V1) } + } else { + current_policy + }; + + if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { + self.tline.do_switch_aux_policy(switch_policy)?; + info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); + switch_policy + } else { + // This branch handles non-valid migration path, and the case that switch_policy == current_policy. + // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit. 
+ current_policy.unwrap_or(AuxFilePolicy::default_tenant_config()) } }; - let path = path.to_string(); - if content.is_empty() { - dir.files.remove(&path); - } else { - dir.files.insert(path, Bytes::copy_from_slice(content)); + + if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { + let key = aux_file::encode_aux_file_key(path); + // retrieve the key from the engine + let old_val = match self.get(key, ctx).await { + Ok(val) => Some(val), + Err(PageReconstructError::MissingKey(_)) => None, + Err(e) => return Err(e.into()), + }; + let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { + aux_file::decode_file_value(old_val)? + } else { + Vec::new() + }; + let mut other_files = Vec::with_capacity(files.len()); + let mut modifying_file = None; + for file @ (p, content) in files { + if path == p { + assert!( + modifying_file.is_none(), + "duplicated entries found for {}", + path + ); + modifying_file = Some(content); + } else { + other_files.push(file); + } + } + let mut new_files = other_files; + match (modifying_file, content.is_empty()) { + (Some(old_content), false) => { + self.tline + .aux_file_size_estimator + .on_update(old_content.len(), content.len()); + new_files.push((path, content)); + } + (Some(old_content), true) => { + self.tline + .aux_file_size_estimator + .on_remove(old_content.len()); + // not adding the file key to the final `new_files` vec. 
+ } + (None, false) => { + self.tline.aux_file_size_estimator.on_add(content.len()); + new_files.push((path, content)); + } + (None, true) => warn!("removing non-existing aux file: {}", path), + } + let new_val = aux_file::encode_file_value(&new_files)?; + self.put(key, Value::Image(new_val.into())); } - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); + + if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy { + let file_path = path.to_string(); + let content = if content.is_empty() { + None + } else { + Some(Bytes::copy_from_slice(content)) + }; + + let n_files; + let mut aux_files = self.tline.aux_files.lock().await; + if let Some(mut dir) = aux_files.dir.take() { + // We already updated aux files in `self`: emit a delta and update our latest value. + dir.upsert(file_path.clone(), content.clone()); + n_files = dir.files.len(); + if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + aux_files.n_deltas = 0; + } else { + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), + ); + aux_files.n_deltas += 1; + } + aux_files.dir = Some(dir); + } else { + // Check if the AUX_FILES_KEY is initialized + match self.get(AUX_FILES_KEY, ctx).await { + Ok(dir_bytes) => { + let mut dir = AuxFilesDirectory::des(&dir_bytes)?; + // Key is already set, we may append a delta + self.put( + AUX_FILES_KEY, + Value::WalRecord(NeonWalRecord::AuxFile { + file_path: file_path.clone(), + content: content.clone(), + }), + ); + dir.upsert(file_path, content); + n_files = dir.files.len(); + aux_files.dir = Some(dir); + } + Err( + e @ (PageReconstructError::Cancelled + | PageReconstructError::AncestorLsnTimeout(_)), + ) => { + // Important that we do not interpret a shutdown error as "not found" and thereby + // reset the map. 
+ return Err(e.into()); + } + // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but + // the original code assumes all other errors are missing keys. Therefore, we keep the code path + // the same for now, though in theory, we should only match the `MissingKey` variant. + Err( + PageReconstructError::Other(_) + | PageReconstructError::WalRedo(_) + | PageReconstructError::MissingKey { .. }, + ) => { + // Key is missing, we must insert an image as the basis for subsequent deltas. + + let mut dir = AuxFilesDirectory { + files: HashMap::new(), + }; + dir.upsert(file_path, content); + self.put( + AUX_FILES_KEY, + Value::Image(Bytes::from( + AuxFilesDirectory::ser(&dir).context("serialize")?, + )), + ); + n_files = 1; + aux_files.dir = Some(dir); + } + } + } + + self.pending_directory_entries + .push((DirectoryKind::AuxFiles, n_files)); + } + Ok(()) } @@ -1349,13 +1739,13 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); for (key, values) in self.pending_updates.drain() { for (lsn, value) in values { - if is_rel_block_key(&key) || is_slru_block_key(key) { + if key.is_rel_block_key() || key.is_slru_block_key() { // This bails out on first error without modifying pending_updates. // That's Ok, cf this function's doc comment. writer.put(key, lsn, &value, ctx).await?; @@ -1375,6 +1765,10 @@ impl<'a> DatadirModification<'a> { self.pending_nblocks = 0; } + for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { + writer.update_directory_entries_count(kind, count as u64); + } + Ok(()) } @@ -1384,18 +1778,27 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. 
/// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let writer = self.tline.writer().await; + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; if !self.pending_updates.is_empty() { - writer.put_batch(&self.pending_updates, ctx).await?; - self.pending_updates.clear(); + // The put_batch call below expects expects the inputs to be sorted by Lsn, + // so we do that first. + let lsn_ordered_batch: VecMap = VecMap::from_iter( + self.pending_updates + .drain() + .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val)))) + .kmerge_by(|lhs, rhs| lhs.0 < rhs.0), + VecMapOrdering::GreaterOrEqual, + ); + + writer.put_batch(lsn_ordered_batch, ctx).await?; } if !self.pending_deletions.is_empty() { - writer.delete_batch(&self.pending_deletions).await?; + writer.delete_batch(&self.pending_deletions, ctx).await?; self.pending_deletions.clear(); } @@ -1412,6 +1815,10 @@ impl<'a> DatadirModification<'a> { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); } + for (kind, count) in std::mem::take(&mut self.pending_directory_entries) { + writer.update_directory_entries_count(kind, count as u64); + } + Ok(()) } @@ -1447,6 +1854,12 @@ impl<'a> DatadirModification<'a> { self.tline.get(key, lsn, ctx).await } + /// Only used during unit tests, force putting a key into the modification. 
+ #[cfg(test)] + pub(crate) fn put_for_test(&mut self, key: Key, val: Value) { + self.put(key, val); + } + fn put(&mut self, key: Key, val: Value) { let values = self.pending_updates.entry(key).or_default(); // Replace the previous value if it exists at the same lsn @@ -1520,9 +1933,19 @@ struct RelDirectory { rels: HashSet<(Oid, u8)>, } -#[derive(Debug, Serialize, Deserialize, Default)] -struct AuxFilesDirectory { - files: HashMap, +#[derive(Debug, Serialize, Deserialize, Default, PartialEq)] +pub(crate) struct AuxFilesDirectory { + pub(crate) files: HashMap, +} + +impl AuxFilesDirectory { + pub(crate) fn upsert(&mut self, key: String, value: Option) { + if let Some(value) = value { + self.files.insert(key, value); + } else { + self.files.remove(&key); + } + } } #[derive(Debug, Serialize, Deserialize)] @@ -1536,388 +1959,82 @@ struct SlruSegmentDirectory { segments: HashSet, } +#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)] +#[repr(u8)] +pub(crate) enum DirectoryKind { + Db, + TwoPhase, + Rel, + AuxFiles, + SlruSegment(SlruKind), +} + +impl DirectoryKind { + pub(crate) const KINDS_NUM: usize = ::LENGTH; + pub(crate) fn offset(&self) -> usize { + self.into_usize() + } +} + static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); -// Layout of the Key address space -// -// The Key struct, used to address the underlying key-value store, consists of -// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map -// all the data and metadata keys into those 18 bytes. -// -// Principles for the mapping: -// -// - Things that are often accessed or modified together, should be close to -// each other in the key space. For example, if a relation is extended by one -// block, we create a new key-value pair for the block data, and update the -// relation size entry. Because of that, the RelSize key comes after all the -// RelBlocks of a relation: the RelSize and the last RelBlock are always next -// to each other. 
-// -// The key space is divided into four major sections, identified by the first -// byte, and the form a hierarchy: -// -// 00 Relation data and metadata -// -// DbDir () -> (dbnode, spcnode) -// Filenodemap -// RelDir -> relnode forknum -// RelBlocks -// RelSize -// -// 01 SLRUs -// -// SlruDir kind -// SlruSegBlocks segno -// SlruSegSize -// -// 02 pg_twophase -// -// 03 misc -// Controlfile -// checkpoint -// pg_version -// -// 04 aux files -// -// Below is a full list of the keyspace allocation: -// -// DbDir: -// 00 00000000 00000000 00000000 00 00000000 -// -// Filenodemap: -// 00 SPCNODE DBNODE 00000000 00 00000000 -// -// RelDir: -// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) -// -// RelBlock: -// 00 SPCNODE DBNODE RELNODE FORK BLKNUM -// -// RelSize: -// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF -// -// SlruDir: -// 01 kind 00000000 00000000 00 00000000 -// -// SlruSegBlock: -// 01 kind 00000001 SEGNO 00 BLKNUM -// -// SlruSegSize: -// 01 kind 00000001 SEGNO 00 FFFFFFFF -// -// TwoPhaseDir: -// 02 00000000 00000000 00000000 00 00000000 -// -// TwoPhaseFile: -// 02 00000000 00000000 00000000 00 XID -// -// ControlFile: -// 03 00000000 00000000 00000000 00 00000000 -// -// Checkpoint: -// 03 00000000 00000000 00000000 00 00000001 -// -// AuxFiles: -// 03 00000000 00000000 00000000 00 00000002 -// - -//-- Section 01: relation data and metadata - -const DBDIR_KEY: Key = Key { - field1: 0x00, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 0, -}; - -fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { - Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0, - field5: 0, - field6: 0, - }..Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0xffffffff, - field5: 0xff, - field6: 0xffffffff, - } -} - -fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key { - Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0, - field5: 0, - field6: 0, - } -} - -fn 
rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { - Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0, - field5: 0, - field6: 1, - } -} - -pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { - Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum, - field6: blknum, - } -} - -fn rel_size_to_key(rel: RelTag) -> Key { - Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum, - field6: 0xffffffff, - } -} - -fn rel_key_range(rel: RelTag) -> Range { - Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum, - field6: 0, - }..Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum + 1, - field6: 0, - } -} - -//-- Section 02: SLRUs - -fn slru_dir_to_key(kind: SlruKind) -> Key { - Key { - field1: 0x01, - field2: match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }, - field3: 0, - field4: 0, - field5: 0, - field6: 0, - } -} - -fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { - Key { - field1: 0x01, - field2: match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }, - field3: 1, - field4: segno, - field5: 0, - field6: blknum, - } -} - -fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { - Key { - field1: 0x01, - field2: match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }, - field3: 1, - field4: segno, - field5: 0, - field6: 0xffffffff, - } -} - -fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { - let field2 = match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }; - - Key { - field1: 0x01, - field2, 
- field3: 1, - field4: segno, - field5: 0, - field6: 0, - }..Key { - field1: 0x01, - field2, - field3: 1, - field4: segno, - field5: 1, - field6: 0, - } -} - -//-- Section 03: pg_twophase - -const TWOPHASEDIR_KEY: Key = Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 0, -}; - -fn twophase_file_key(xid: TransactionId) -> Key { - Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: xid, - } -} - -fn twophase_key_range(xid: TransactionId) -> Range { - let (next_xid, overflowed) = xid.overflowing_add(1); - - Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: xid, - }..Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: u8::from(overflowed), - field6: next_xid, - } -} - -//-- Section 03: Control file -const CONTROLFILE_KEY: Key = Key { - field1: 0x03, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 0, -}; - -const CHECKPOINT_KEY: Key = Key { - field1: 0x03, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 1, -}; - -const AUX_FILES_KEY: Key = Key { - field1: 0x03, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 2, -}; - -// Reverse mappings for a few Keys. -// These are needed by WAL redo manager. - -// AUX_FILES currently stores only data for logical replication (slots etc), and -// we don't preserve these on a branch because safekeepers can't follow timeline -// switch (and generally it likely should be optional), so ignore these. -pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY -} - -/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. 
-pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { - Ok(match key.field1 { - 0x00 => ( - RelTag { - spcnode: key.field2, - dbnode: key.field3, - relnode: key.field4, - forknum: key.field5, - }, - key.field6, - ), - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) -} -pub fn is_rel_fsm_block_key(key: Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff -} - -pub fn is_rel_vm_block_key(key: Key) -> bool { - key.field1 == 0x00 - && key.field4 != 0 - && key.field5 == VISIBILITYMAP_FORKNUM - && key.field6 != 0xffffffff -} - -pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { - Ok(match key.field1 { - 0x01 => { - let kind = match key.field2 { - 0x00 => SlruKind::Clog, - 0x01 => SlruKind::MultiXactMembers, - 0x02 => SlruKind::MultiXactOffsets, - _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), - }; - let segno = key.field4; - let blknum = key.field6; - - (kind, segno, blknum) - } - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) -} - -fn is_slru_block_key(key: Key) -> bool { - key.field1 == 0x01 // SLRU-related - && key.field3 == 0x00000001 // but not SlruDir - && key.field6 != 0xffffffff // and not SlruSegSize -} - #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { - //use super::repo_harness::*; - //use super::*; + use hex_literal::hex; + use utils::id::TimelineId; + + use super::*; + + use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION}; + + /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline + #[tokio::test] + async fn aux_files_round_trip() -> anyhow::Result<()> { + let name = "aux_files_round_trip"; + let harness = TenantHarness::create(name)?; + + pub const TIMELINE_ID: TimelineId = + TimelineId::from_array(hex!("11223344556677881122334455667788")); + + let (tenant, ctx) = harness.load().await; + let tline = 
tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + // First modification: insert two keys + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + let expect_1008 = HashMap::from([ + ("foo/bar1".to_string(), Bytes::from_static(b"content1")), + ("foo/bar2".to_string(), Bytes::from_static(b"content2")), + ]); + + let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + assert_eq!(readback, expect_1008); + + // Second modification: update one key, remove the other + let mut modification = tline.begin_modification(Lsn(0x2000)); + modification.put_file("foo/bar1", b"content3", &ctx).await?; + modification.set_lsn(Lsn(0x2008))?; + modification.put_file("foo/bar2", b"", &ctx).await?; + modification.commit(&ctx).await?; + let expect_2008 = + HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]); + + let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?; + assert_eq!(readback, expect_2008); + + // Reading back in time works + let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?; + assert_eq!(readback, expect_1008); + + Ok(()) + } /* fn assert_current_logical_size(timeline: &DatadirTimeline, lsn: Lsn) { diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c726139524..5a334d0290 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -33,11 +33,53 @@ impl Value { } } +#[derive(Debug, PartialEq)] +pub(crate) enum InvalidInput { + TooShortValue, + TooShortPostgresRecord, +} + +/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets +/// use this type for querying if a slice looks some particular way. 
+pub(crate) struct ValueBytes; + +impl ValueBytes { + pub(crate) fn will_init(raw: &[u8]) -> Result { + if raw.len() < 12 { + return Err(InvalidInput::TooShortValue); + } + + let value_discriminator = &raw[0..4]; + + if value_discriminator == [0, 0, 0, 0] { + // Value::Image always initializes + return Ok(true); + } + + if value_discriminator != [0, 0, 0, 1] { + // not a Value::WalRecord(..) + return Ok(false); + } + + let walrecord_discriminator = &raw[4..8]; + + if walrecord_discriminator != [0, 0, 0, 0] { + // only NeonWalRecord::Postgres can have will_init + return Ok(false); + } + + if raw.len() < 17 { + return Err(InvalidInput::TooShortPostgresRecord); + } + + Ok(raw[8] == 1) + } +} + #[cfg(test)] mod test { use super::*; - use bytes::Bytes; use utils::bin_ser::BeSer; macro_rules! roundtrip { @@ -71,6 +113,8 @@ mod test { ]; roundtrip!(image, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); } #[test] @@ -94,6 +138,96 @@ mod test { ]; roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + } + + #[test] + fn bytes_inspection_too_short_image() { + let rec = Value::Image(Bytes::from_static(b"")); + + #[rustfmt::skip] + let expected = [ + // top level discriminator of 4 bytes + 0x00, 0x00, 0x00, 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 12); + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ); + } + } + + #[test] + fn bytes_inspection_too_short_postgres_record() { + let rec = NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b""), + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // flattened discriminator of total 8 bytes + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + // will_init + 0x00, + // 8 byte length + 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); + assert_eq!(expected.len(), 17); + for len in 12..17 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortPostgresRecord + ) + } + for len in 0..12 { + assert_eq!( + ValueBytes::will_init(&expected[..len]).unwrap_err(), + InvalidInput::TooShortValue + ) + } + } + + #[test] + fn clear_visibility_map_flags_example() { + let rec = NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: Some(0x11), + old_heap_blkno: None, + flags: 0x03, + }; + let rec = Value::WalRecord(rec); + + #[rustfmt::skip] + let expected = [ + // discriminators + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x01, + // Some == 1 followed by 4 bytes + 0x01, 0x00, 0x00, 0x00, 0x11, + // None == 0 + 0x00, + // flags + 0x03 + ]; + + roundtrip!(rec, expected); + + assert!(!ValueBytes::will_init(&expected).unwrap()); } } @@ -106,6 +240,7 @@ pub struct GcResult { pub layers_needed_by_cutoff: u64, pub layers_needed_by_pitr: u64, pub layers_needed_by_branches: u64, + pub layers_needed_by_leases: u64, pub layers_not_updated: u64, pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. 
@@ -135,6 +270,7 @@ impl AddAssign for GcResult { self.layers_needed_by_pitr += other.layers_needed_by_pitr; self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; self.layers_needed_by_branches += other.layers_needed_by_branches; + self.layers_needed_by_leases += other.layers_needed_by_leases; self.layers_not_updated += other.layers_not_updated; self.layers_removed += other.layers_removed; diff --git a/pageserver/src/span.rs b/pageserver/src/span.rs new file mode 100644 index 0000000000..91fee50514 --- /dev/null +++ b/pageserver/src/span.rs @@ -0,0 +1,43 @@ +use utils::tracing_span_assert::check_fields_present; + +mod extractors { + use utils::tracing_span_assert::ConstExtractor; + + pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id"); + pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id"); + pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id"); +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_id() { + if cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID]) + { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { + if cfg!(debug_assertions) { + if let Err(missing) = check_fields_present!([ + &extractors::TENANT_ID, + &extractors::SHARD_ID, + &extractors::TIMELINE_ID, + ]) { + panic!("missing extractors: {missing:?}") + } + } +} + +#[track_caller] +pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() { + if cfg!(debug_assertions) { + if let Err(missing) = + check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,]) + { + panic!("missing extractors: {missing:?}") + } + } +} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5a06a97525..5f46ce3d69 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -30,20 
+30,17 @@ //! only a single tenant or timeline. //! -// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro. -// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224. -#![allow(clippy::declare_interior_mutable_const)] - use std::collections::HashMap; use std::fmt; use std::future::Future; +use std::num::NonZeroUsize; use std::panic::AssertUnwindSafe; +use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; use pageserver_api::shard::TenantShardId; -use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; use tokio_util::sync::CancellationToken; @@ -52,9 +49,10 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; +use utils::env; use utils::id::TimelineId; -use crate::shutdown_pageserver; +use crate::metrics::set_tokio_runtime_setup; // // There are four runtimes: @@ -104,52 +102,119 @@ use crate::shutdown_pageserver; // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't // happen, but still. 
// -pub static COMPUTE_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("compute request worker") - .enable_all() - .build() - .expect("Failed to create compute request runtime") -}); -pub static MGMT_REQUEST_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("mgmt request worker") - .enable_all() - .build() - .expect("Failed to create mgmt request runtime") -}); - -pub static WALRECEIVER_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("walreceiver worker") - .enable_all() - .build() - .expect("Failed to create walreceiver runtime") -}); - -pub static BACKGROUND_RUNTIME: Lazy = Lazy::new(|| { - tokio::runtime::Builder::new_multi_thread() - .thread_name("background op worker") - // if you change the number of worker threads please change the constant below - .enable_all() - .build() - .expect("Failed to create background op runtime") -}); - -pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| { - // force init and thus panics - let _ = BACKGROUND_RUNTIME.handle(); +pub(crate) static TOKIO_WORKER_THREADS: Lazy = Lazy::new(|| { // replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly // tokio would had already panicked for parsing errors or NotUnicode // // this will be wrong if any of the runtimes gets their worker threads configured to something // else, but that has not been needed in a long time. 
- std::env::var("TOKIO_WORKER_THREADS") - .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) + NonZeroUsize::new( + std::env::var("TOKIO_WORKER_THREADS") + .map(|s| s.parse::().unwrap()) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())), + ) + .expect("the max() ensures that this is not zero") }); +enum TokioRuntimeMode { + SingleThreaded, + MultiThreaded { num_workers: NonZeroUsize }, +} + +impl FromStr for TokioRuntimeMode { + type Err = String; + + fn from_str(s: &str) -> Result { + match s { + "current_thread" => Ok(TokioRuntimeMode::SingleThreaded), + s => match s.strip_prefix("multi_thread:") { + Some("default") => Ok(TokioRuntimeMode::MultiThreaded { + num_workers: *TOKIO_WORKER_THREADS, + }), + Some(suffix) => { + let num_workers = suffix.parse::().map_err(|e| { + format!( + "invalid number of multi-threaded runtime workers ({suffix:?}): {e}", + ) + })?; + Ok(TokioRuntimeMode::MultiThreaded { num_workers }) + } + None => Err(format!("invalid runtime config: {s:?}")), + }, + } + } +} + +static ONE_RUNTIME: Lazy> = Lazy::new(|| { + let thread_name = "pageserver-tokio"; + let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { + // If the env var is not set, leave this static as None. 
+ set_tokio_runtime_setup( + "multiple-runtimes", + NUM_MULTIPLE_RUNTIMES + .checked_mul(*TOKIO_WORKER_THREADS) + .unwrap(), + ); + return None; + }; + Some(match mode { + TokioRuntimeMode::SingleThreaded => { + set_tokio_runtime_setup("one-runtime-single-threaded", NonZeroUsize::new(1).unwrap()); + tokio::runtime::Builder::new_current_thread() + .thread_name(thread_name) + .enable_all() + .build() + .expect("failed to create one single runtime") + } + TokioRuntimeMode::MultiThreaded { num_workers } => { + set_tokio_runtime_setup("one-runtime-multi-threaded", num_workers); + tokio::runtime::Builder::new_multi_thread() + .thread_name(thread_name) + .enable_all() + .worker_threads(num_workers.get()) + .build() + .expect("failed to create one multi-threaded runtime") + } + }) +}); + +/// Declare a lazy static variable named `$varname` that will resolve +/// to a tokio runtime handle. If the env var `NEON_PAGESERVER_USE_ONE_RUNTIME` +/// is set, this will resolve to `ONE_RUNTIME`. Otherwise, the macro invocation +/// declares a separate runtime and the lazy static variable `$varname` +/// will resolve to that separate runtime. +/// +/// The result is is that `$varname.spawn()` will use `ONE_RUNTIME` if +/// `NEON_PAGESERVER_USE_ONE_RUNTIME` is set, and will use the separate runtime +/// otherwise. +macro_rules! 
pageserver_runtime { + ($varname:ident, $name:literal) => { + pub static $varname: Lazy<&'static tokio::runtime::Runtime> = Lazy::new(|| { + if let Some(runtime) = &*ONE_RUNTIME { + return runtime; + } + static RUNTIME: Lazy = Lazy::new(|| { + tokio::runtime::Builder::new_multi_thread() + .thread_name($name) + .worker_threads(TOKIO_WORKER_THREADS.get()) + .enable_all() + .build() + .expect(std::concat!("Failed to create runtime ", $name)) + }); + &*RUNTIME + }); + }; +} + +pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker"); +pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker"); +pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker"); +pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker"); +// Bump this number when adding a new pageserver_runtime! +// SAFETY: it's obviously correct +const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) }; + #[derive(Debug, Clone, Copy)] pub struct PageserverTaskId(u64); @@ -192,6 +257,7 @@ task_local! { serde::Serialize, serde::Deserialize, strum_macros::IntoStaticStr, + strum_macros::EnumString, )] pub enum TaskKind { // Pageserver startup, i.e., `main` @@ -219,13 +285,12 @@ pub enum TaskKind { /// Internally, `Client` hands over requests to the `Connection` object. /// The `Connection` object is responsible for speaking the wire protocol. /// - /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. - /// That abstraction doesn't use `task_mgr`. + /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task. /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. 
/// - /// Once the connection is established, the `TaskHandle` task creates a - /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// Once the connection is established, the `TaskHandle` task spawns a + /// [`WalReceiverConnectionPoller`] task that is responsible for polling /// the `Connection` object. /// A `CancellationToken` created by the `TaskHandle` task ensures /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. @@ -235,7 +300,6 @@ pub enum TaskKind { WalReceiverManager, /// The `TaskHandle` task that executes `handle_walreceiver_connection`. - /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. /// See the comment on [`WalReceiverManager`]. /// /// [`WalReceiverManager`]: Self::WalReceiverManager @@ -255,6 +319,9 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, + // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure) + IngestHousekeeping, + /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, @@ -275,9 +342,6 @@ pub enum TaskKind { // Task that uploads a file to remote storage RemoteUploadTask, - // Task that downloads a file from remote storage - RemoteDownloadTask, - // task that handles the initial downloading of all tenants InitialLoad, @@ -300,8 +364,14 @@ pub enum TaskKind { DebugTool, + EphemeralFilePreWarmPageCache, + + LayerDownload, + #[cfg(test)] UnitTest, + + DetachAncestor, } #[derive(Default)] @@ -312,7 +382,6 @@ struct MutableTaskState { } struct PageServerTask { - #[allow(dead_code)] // unused currently task_id: PageserverTaskId, kind: TaskKind, @@ -460,7 +529,7 @@ async fn task_finish( } if shutdown_process { - shutdown_pageserver(None, 1).await; + std::process::exit(1); } } @@ -576,8 +645,8 @@ pub fn shutdown_token() -> CancellationToken { /// Has the current task been requested to shut down? 
pub fn is_shutdown_requested() -> bool { - if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) { - cancel.is_cancelled() + if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) { + true_or_false } else { if !cfg!(test) { warn!("is_shutdown_requested() called in an unexpected task or thread"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7c609452e5..ca5765c99b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,20 +12,28 @@ //! use anyhow::{bail, Context}; -use camino::{Utf8Path, Utf8PathBuf}; +use arc_swap::ArcSwap; +use camino::Utf8Path; +use camino::Utf8PathBuf; use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::FutureExt; use futures::StreamExt; +use pageserver_api::models; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::TimelineState; +use pageserver_api::models::TopTenantShardItem; +use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; +use remote_storage::TimeoutOrCancel; use std::fmt; +use std::time::SystemTime; use storage_broker::BrokerClientChannel; use tokio::io::BufReader; -use tokio::runtime::Handle; use tokio::sync::watch; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -35,26 +43,30 @@ use utils::completion; use utils::crashsafe::path_with_suffix_extension; use utils::failpoint_support; use utils::fs_ext; +use utils::pausable_failpoint; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; use utils::timeout::timeout_cancellable; use utils::timeout::TimeoutCancellableError; +use utils::zstd::create_zst_tarball; +use utils::zstd::extract_zst_tarball; use self::config::AttachedLocationConfig; use self::config::AttachmentMode; use self::config::LocationConf; use self::config::TenantConf; use 
self::delete::DeleteTenantFlow; -use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::mgr::TenantsMap; +use self::remote_timeline_client::upload::upload_index_part; use self::remote_timeline_client::RemoteTimelineClient; +use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; -use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; +use self::timeline::GcCutoffs; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; use crate::config::PageServerConf; @@ -64,20 +76,21 @@ use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; use crate::metrics::TENANT; -use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC}; +use crate::metrics::{ + remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, +}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; use crate::tenant::config::TenantConfOpt; -use crate::tenant::metadata::load_metadata; pub use crate::tenant::remote_timeline_client::index::IndexPart; +use crate::tenant::remote_timeline_client::remote_initdb_archive_path; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::InitializationOrder; -use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -86,15 +99,14 @@ use std::fmt::Debug; use std::fmt::Display; use std::fs; use std::fs::File; -use std::io; use std::ops::Bound::Included; -use std::process::Stdio; use std::sync::atomic::AtomicU64; 
use std::sync::atomic::Ordering; use std::sync::Arc; -use std::sync::{Mutex, RwLock}; +use std::sync::Mutex; use std::time::{Duration, Instant}; +use crate::span; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; @@ -105,43 +117,22 @@ pub use pageserver_api::models::TenantState; use tokio::sync::Semaphore; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); -use toml_edit; use utils::{ crashsafe, generation::Generation, - id::{TenantId, TimelineId}, + id::TimelineId, lsn::{Lsn, RecordLsn}, }; -/// Declare a failpoint that can use the `pause` failpoint action. -/// We don't want to block the executor thread, hence, spawn_blocking + await. -macro_rules! pausable_failpoint { - ($name:literal) => { - if cfg!(feature = "testing") { - tokio::task::spawn_blocking({ - let current = tracing::Span::current(); - move || { - let _entered = current.entered(); - tracing::info!("at failpoint {}", $name); - fail::fail_point!($name); - } - }) - .await - .expect("spawn_blocking"); - } - }; -} - pub mod blob_io; pub mod block_io; +pub mod vectored_blob_io; pub mod disk_btree; pub(crate) mod ephemeral_file; pub mod layer_map; -mod span; pub mod metadata; -mod par_fsync; pub mod remote_timeline_client; pub mod storage_layer; @@ -156,11 +147,10 @@ pub(crate) mod timeline; pub mod size; -pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; -pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; +pub(crate) mod throttle; -// re-export for use in remote_timeline_client.rs -pub use crate::tenant::metadata::save_metadata; +pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; // re-export for use in walreceiver pub use crate::tenant::timeline::WalReceiverInfo; @@ -178,7 +168,7 @@ pub const 
TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted"; #[derive(Clone)] pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, - pub remote_storage: Option, + pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, } @@ -191,11 +181,18 @@ pub(super) struct AttachedTenantConf { } impl AttachedTenantConf { + fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self { + Self { + tenant_conf, + location, + } + } + fn try_from(location_conf: LocationConf) -> anyhow::Result { match &location_conf.mode { LocationMode::Attached(attach_conf) => Ok(Self { tenant_conf: location_conf.tenant_conf, - location: attach_conf.clone(), + location: *attach_conf, }), LocationMode::Secondary(_) => { anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode") @@ -217,7 +214,11 @@ pub(crate) struct TenantPreload { /// When we spawn a tenant, there is a special mode for tenant creation that /// avoids trying to read anything from remote storage. pub(crate) enum SpawnMode { - Normal, + /// Activate as soon as possible + Eager, + /// Lazy activation in the background, with the option to skip the queue if the need comes up + Lazy, + /// Tenant has been created during the lifetime of this process Create, } @@ -238,7 +239,7 @@ pub struct Tenant { // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. // This is necessary to allow global config updates. - tenant_conf: Arc>, + tenant_conf: Arc>, tenant_shard_id: TenantShardId, @@ -266,10 +267,10 @@ pub struct Tenant { // with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn // timeout... 
gc_cs: tokio::sync::Mutex<()>, - walredo_mgr: Arc, + walredo_mgr: Option>, // provides access to timeline data sitting in the remote storage - pub(crate) remote_storage: Option, + pub(crate) remote_storage: GenericRemoteStorage, // Access to global deletion queue for when this tenant wants to schedule a deletion deletion_queue_client: DeletionQueueClient, @@ -294,6 +295,14 @@ pub struct Tenant { // Users of the Tenant such as the page service must take this Gate to avoid // trying to use a Tenant which is shutting down. pub(crate) gate: Gate, + + /// Throttle applied at the top of [`Timeline::get`]. + /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. + pub(crate) timeline_get_throttle: + Arc>, + + /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. + ongoing_timeline_detach: std::sync::Mutex>, } impl std::fmt::Debug for Tenant { @@ -355,19 +364,27 @@ impl WalRedoManager { } } } + + pub(crate) fn status(&self) -> Option { + match self { + WalRedoManager::Prod(m) => Some(m.status()), + #[cfg(test)] + WalRedoManager::Test(_) => None, + } + } } #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")] NotActive { - tenant_id: TenantId, + tenant_id: TenantShardId, timeline_id: TimelineId, state: TimelineState, }, #[error("Timeline {tenant_id}/{timeline_id} was not found")] NotFound { - tenant_id: TenantId, + tenant_id: TenantShardId, timeline_id: TimelineId, }, } @@ -466,16 +483,51 @@ impl From for InitdbError { } } -struct TenantDirectoryScan { - sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, - timelines_to_resume_deletion: Vec<(TimelineId, Option)>, -} - enum CreateTimelineCause { Load, Delete, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum GcError { + // The tenant is shutting down + #[error("tenant shutting down")] + TenantCancelled, + + // The 
tenant is shutting down + #[error("timeline shutting down")] + TimelineCancelled, + + // The tenant is in a state inelegible to run GC + #[error("not active")] + NotActive, + + // A requested GC cutoff LSN was invalid, for example it tried to move backwards + #[error("not active")] + BadLsn { why: String }, + + // A remote storage error while scheduling updates after compaction + #[error(transparent)] + Remote(anyhow::Error), + + // An error reading while calculating GC cutoffs + #[error(transparent)] + GcCutoffs(PageReconstructError), + + // If GC was invoked for a particular timeline, this error means it didn't exist + #[error("timeline not found")] + TimelineNotFound, +} + +impl From for GcError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => Self::TimelineCancelled, + other => Self::GcCutoffs(other), + } + } +} + impl Tenant { /// Yet another helper for timeline initialization. /// @@ -494,6 +546,7 @@ impl Tenant { index_part: Option, metadata: TimelineMetadata, ancestor: Option>, + last_aux_file_policy: Option, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -504,6 +557,10 @@ impl Tenant { ancestor.clone(), resources, CreateTimelineCause::Load, + // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`, + // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence. + // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2. 
+ last_aux_file_policy, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -517,20 +574,26 @@ impl Tenant { ); if let Some(index_part) = index_part.as_ref() { + timeline.remote_client.init_upload_queue(index_part)?; + timeline - .remote_client - .as_ref() - .unwrap() - .init_upload_queue(index_part)?; - } else if self.remote_storage.is_some() { + .last_aux_file_policy + .store(index_part.last_aux_file_policy()); + } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. // By doing what we do here, the index part upload is retried. // If control plane retries timeline creation in the meantime, the mgmt API handler // for timeline creation will coalesce on the upload we queue here. - let rtc = timeline.remote_client.as_ref().unwrap(); - rtc.init_upload_queue_for_empty_remote(&metadata)?; - rtc.schedule_index_upload_for_metadata_update(&metadata)?; + + // FIXME: this branch should be dead code as we no longer write local metadata. + + timeline + .remote_client + .init_upload_queue_for_empty_remote(&metadata)?; + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; } timeline @@ -544,9 +607,8 @@ impl Tenant { // avoiding holding it across awaits let mut timelines_accessor = self.timelines.lock().unwrap(); match timelines_accessor.entry(timeline_id) { + // We should never try and load the same timeline twice during startup Entry::Occupied(_) => { - // The uninit mark file acts as a lock that prevents another task from - // initializing the timeline at the same time. 
unreachable!( "Timeline {tenant_id}/{timeline_id} already exists in the tenant map" ); @@ -607,20 +669,29 @@ impl Tenant { deletion_queue_client, } = resources; + let attach_mode = attached_conf.location.attach_mode; + let generation = attached_conf.location.generation; + let tenant = Arc::new(Tenant::new( TenantState::Attaching, conf, attached_conf, shard_identity, - wal_redo_manager, + Some(wal_redo_manager), tenant_shard_id, remote_storage.clone(), deletion_queue_client, )); + // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if + // we shut down while attaching. + let attach_gate_guard = tenant + .gate + .enter() + .expect("We just created the Tenant: nothing else can have shut it down yet"); + // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); - let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -630,6 +701,14 @@ impl Tenant { "attach tenant", false, async move { + + info!( + ?attach_mode, + "Attaching tenant" + ); + + let _gate_guard = attach_gate_guard; + // Is this tenant being spawned as part of process startup? let starting_up = init_order.is_some(); scopeguard::defer! { @@ -639,9 +718,20 @@ impl Tenant { } // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. 
+ enum BrokenVerbosity { + Error, + Info + } let make_broken = - |t: &Tenant, err: anyhow::Error| { - error!("attach failed, setting tenant state to Broken: {err:?}"); + |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| { + match verbosity { + BrokenVerbosity::Info => { + info!("attach cancelled, setting tenant state to Broken: {err}"); + }, + BrokenVerbosity::Error => { + error!("attach failed, setting tenant state to Broken: {err:?}"); + } + } t.state.send_modify(|state| { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. @@ -665,84 +755,74 @@ impl Tenant { .and_then(|x| x.initial_tenant_load_remote.take()); enum AttachType<'a> { - // During pageserver startup, we are attaching this tenant lazily in the background - Warmup(tokio::sync::SemaphorePermit<'a>), - // During pageserver startup, we are attaching this tenant as soon as we can, - // because a client tried to access it. + /// We are attaching this tenant lazily in the background. + Warmup { + _permit: tokio::sync::SemaphorePermit<'a>, + during_startup: bool + }, + /// We are attaching this tenant as soon as we can, because for example an + /// endpoint tried to access it. OnDemand, - // During normal operations after startup, we are attaching a tenant. + /// During normal operations after startup, we are attaching a tenant, and + /// eager attach was requested. Normal, } - // Before doing any I/O, wait for either or: - // - A client to attempt to access to this tenant (on-demand loading) - // - A permit to become available in the warmup semaphore (background warmup) - // - // Some-ness of init_order is how we know if we're attaching during startup or later - // in process lifetime. 
- let attach_type = if init_order.is_some() { + let attach_type = if matches!(mode, SpawnMode::Lazy) { + // Before doing any I/O, wait for at least one of: + // - A client attempting to access to this tenant (on-demand loading) + // - A permit becoming available in the warmup semaphore (background warmup) + tokio::select!( - _ = tenant_clone.activate_now_sem.acquire() => { + permit = tenant_clone.activate_now_sem.acquire() => { + let _ = permit.expect("activate_now_sem is never closed"); tracing::info!("Activating tenant (on-demand)"); AttachType::OnDemand }, - permit_result = conf.concurrent_tenant_warmup.inner().acquire() => { - match permit_result { - Ok(p) => { - tracing::info!("Activating tenant (warmup)"); - AttachType::Warmup(p) - } - Err(_) => { - // This is unexpected: the warmup semaphore should stay alive - // for the lifetime of init_order. Log a warning and proceed. - tracing::warn!("warmup_limit semaphore unexpectedly closed"); - AttachType::Normal - } + permit = conf.concurrent_tenant_warmup.inner().acquire() => { + let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed"); + tracing::info!("Activating tenant (warmup)"); + AttachType::Warmup { + _permit, + during_startup: init_order.is_some() } - } _ = tenant_clone.cancel.cancelled() => { // This is safe, but should be pretty rare: it is interesting if a tenant // stayed in Activating for such a long time that shutdown found it in // that state. tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation"); + // Make the tenant broken so that set_stopping will not hang waiting for it to leave + // the Attaching state. This is an over-reaction (nothing really broke, the tenant is + // just shutting down), but ensures progress. 
+ make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info); return Ok(()); }, ) } else { + // SpawnMode::{Create,Eager} always cause jumping ahead of the + // concurrent_tenant_warmup queue AttachType::Normal }; - let preload_timer = TENANT.preload.start_timer(); - let preload = match mode { + let preload = match &mode { SpawnMode::Create => { - // Don't count the skipped preload into the histogram of preload durations - preload_timer.stop_and_discard(); None }, - SpawnMode::Normal => { - match &remote_storage { - Some(remote_storage) => Some( - match tenant_clone - .preload(remote_storage, task_mgr::shutdown_token()) - .instrument( - tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), - ) - .await { - Ok(p) => { - preload_timer.observe_duration(); - p - } - , - Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); - return Ok(()); - } - }, - ), - None => None, + SpawnMode::Eager | SpawnMode::Lazy => { + let _preload_timer = TENANT.preload.start_timer(); + let res = tenant_clone + .preload(&remote_storage, task_mgr::shutdown_token()) + .await; + match res { + Ok(p) => Some(p), + Err(e) => { + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); + return Ok(()); + } } } + }; // Remote preload is complete. 
@@ -758,7 +838,7 @@ impl Tenant { { Ok(should_resume_deletion) => should_resume_deletion, Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); + make_broken(&tenant_clone, anyhow::anyhow!(err), BrokenVerbosity::Error); return Ok(()); } } @@ -778,37 +858,38 @@ impl Tenant { info!("ready for backgound jobs barrier"); } - match DeleteTenantFlow::resume_from_attach( + let deleted = DeleteTenantFlow::resume_from_attach( deletion, &tenant_clone, preload, tenants, &ctx, ) - .await - { - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); - return Ok(()); - } - Ok(()) => return Ok(()), + .await; + + if let Err(e) = deleted { + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); } + + return Ok(()); } // We will time the duration of the attach phase unless this is a creation (attach will do no work) - let attach_timer = match mode { - SpawnMode::Create => None, - SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + let attached = { + let _attach_timer = match mode { + SpawnMode::Create => None, + SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()), + }; + tenant_clone.attach(preload, mode, &ctx).await }; - match tenant_clone.attach(preload, &ctx).await { + + match attached { Ok(()) => { info!("attach finished, activating"); - if let Some(t)= attach_timer {t.observe_duration();} tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - if let Some(t)= attach_timer {t.observe_duration();} - make_broken(&tenant_clone, anyhow::anyhow!(e)); + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); } } @@ -819,35 +900,27 @@ impl Tenant { // It also prevents the warmup proccess competing with the concurrency limit on // logical size calculations: if logical size calculation semaphore is saturated, // then warmup will wait for that before proceeding to the next tenant. 
- if let AttachType::Warmup(_permit) = attach_type { - let mut futs = FuturesUnordered::new(); - let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect(); - for t in timelines { - futs.push(t.await_initial_logical_size()) - } + if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) { + let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect(); tracing::info!("Waiting for initial logical sizes while warming up..."); - while futs.next().await.is_some() { - - } + while futs.next().await.is_some() {} tracing::info!("Warm-up complete"); } Ok(()) } - .instrument({ - let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)), ); Ok(tenant) } + #[instrument(skip_all)] pub(crate) async fn preload( - self: &Arc, + self: &Arc, remote_storage: &GenericRemoteStorage, cancel: CancellationToken, ) -> anyhow::Result { + span::debug_assert_current_span_has_tenant_id(); // Get list of remote timelines // download index files for every tenant timeline info!("listing remote timelines"); @@ -873,9 +946,13 @@ impl Tenant { Ok(TenantPreload { deleting, - timelines: self - .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel) - .await?, + timelines: Self::load_timeline_metadata( + self, + remote_timeline_ids, + remote_storage, + cancel, + ) + .await?, }) } @@ -887,18 +964,21 @@ impl Tenant { async fn attach( self: &Arc, preload: Option, + mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); failpoint_support::sleep_millis_async!("before-attaching-tenant"); - let preload = match preload { - Some(p) => p, 
- None => { - // Deprecated dev mode: load from local disk state instead of remote storage - // https://github.com/neondatabase/neon/issues/5624 - return self.load_local(ctx).await; + let preload = match (preload, mode) { + (Some(p), _) => p, + (None, SpawnMode::Create) => TenantPreload { + deleting: false, + timelines: HashMap::new(), + }, + (None, _) => { + anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); } }; @@ -966,8 +1046,8 @@ impl Tenant { index_part, remote_metadata, TimelineResources { - remote_client: Some(remote_client), - deletion_queue_client: self.deletion_queue_client.clone(), + remote_client, + timeline_get_throttle: self.timeline_get_throttle.clone(), }, ctx, ) @@ -991,9 +1071,9 @@ impl Tenant { Arc::clone(self), timeline_id, &index_part.metadata, - Some(remote_timeline_client), - self.deletion_queue_client.clone(), + remote_timeline_client, ) + .instrument(tracing::info_span!("timeline_delete", %timeline_id)) .await .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; @@ -1003,7 +1083,10 @@ impl Tenant { // IndexPart is the source of truth. self.clean_up_timelines(&existent_timelines)?; - failpoint_support::sleep_millis_async!("attach-before-activate"); + fail::fail_point!("attach-before-activate", |_| { + anyhow::bail!("attach-before-activate"); + }); + failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel); info!("Done"); @@ -1032,8 +1115,7 @@ impl Tenant { let entry_path = entry.path(); let purge = if crate::is_temporary(entry_path) - // TODO: uninit_mark isn't needed any more, since uninitialized timelines are already - // covered by the check that the timeline must exist in remote storage. 
+ // TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718) || is_uninit_mark(entry_path) || crate::is_delete_mark(entry_path) { @@ -1080,9 +1162,7 @@ impl Tenant { let mut size = 0; for timeline in self.list_timelines() { - if let Some(remote_client) = &timeline.remote_client { - size += remote_client.get_remote_physical_size(); - } + size += timeline.remote_client.get_remote_physical_size(); } size @@ -1117,16 +1197,7 @@ impl Tenant { None }; - // timeline loading after attach expects to find metadata file for each metadata - save_metadata( - self.conf, - &self.tenant_shard_id, - &timeline_id, - &remote_metadata, - ) - .await - .context("save_metadata") - .map_err(LoadLocalTimelineError::Load)?; + let last_aux_file_policy = index_part.last_aux_file_policy(); self.timeline_init_and_sync( timeline_id, @@ -1134,6 +1205,7 @@ impl Tenant { Some(index_part), remote_metadata, ancestor, + last_aux_file_policy, ctx, ) .await @@ -1143,12 +1215,9 @@ impl Tenant { pub fn create_broken_tenant( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, + remote_storage: GenericRemoteStorage, reason: String, ) -> Arc { - let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( - conf, - tenant_shard_id, - ))); Arc::new(Tenant::new( TenantState::Broken { reason, @@ -1159,156 +1228,13 @@ impl Tenant { // Shard identity isn't meaningful for a broken tenant: it's just a placeholder // to occupy the slot for this TenantShardId. ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count), - wal_redo_manager, - tenant_shard_id, None, + tenant_shard_id, + remote_storage, DeletionQueueClient::broken(), )) } - fn scan_and_sort_timelines_dir(self: Arc) -> anyhow::Result { - let mut timelines_to_load: HashMap = HashMap::new(); - // Note timelines_to_resume_deletion needs to be separate because it can be not sortable - // from the point of `tree_sort_timelines`. 
I e some parents can be missing because deletion - // completed in non topological order (for example because parent has smaller number of layer files in it) - let mut timelines_to_resume_deletion: Vec<(TimelineId, Option)> = vec![]; - - let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id); - - for entry in timelines_dir - .read_dir_utf8() - .context("list timelines directory for tenant")? - { - let entry = entry.context("read timeline dir entry")?; - let timeline_dir = entry.path(); - - if crate::is_temporary(timeline_dir) { - info!("Found temporary timeline directory, removing: {timeline_dir}"); - if let Err(e) = std::fs::remove_dir_all(timeline_dir) { - error!("Failed to remove temporary directory '{timeline_dir}': {e:?}"); - } - } else if is_uninit_mark(timeline_dir) { - if !timeline_dir.exists() { - warn!("Timeline dir entry become invalid: {timeline_dir}"); - continue; - } - - let timeline_uninit_mark_file = &timeline_dir; - info!( - "Found an uninit mark file {timeline_uninit_mark_file}, removing the timeline and its uninit mark", - ); - let timeline_id = - TimelineId::try_from(timeline_uninit_mark_file.file_stem()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}", - ) - })?; - let timeline_dir = self.conf.timeline_path(&self.tenant_shard_id, &timeline_id); - if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - } else if crate::is_delete_mark(timeline_dir) { - // If metadata exists, load as usual, continue deletion - let timeline_id = TimelineId::try_from(timeline_dir.file_stem()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline uninit mark name {timeline_dir}", - ) - })?; - - info!("Found deletion mark for timeline {}", timeline_id); - - match load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) { - Ok(metadata) 
=> { - timelines_to_resume_deletion.push((timeline_id, Some(metadata))) - } - Err(e) => match &e { - LoadMetadataError::Read(r) => { - if r.kind() != io::ErrorKind::NotFound { - return Err(anyhow::anyhow!(e)).with_context(|| { - format!("Failed to load metadata for timeline_id {timeline_id}") - }); - } - - // If metadata doesnt exist it means that we've crashed without - // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow. - // So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`. - // We cant do it here because the method is async so we'd need block_on - // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations - // so that basically results in a cycle: - // spawn_blocking - // - block_on - // - spawn_blocking - // which can lead to running out of threads in blocing pool. - timelines_to_resume_deletion.push((timeline_id, None)); - } - _ => { - return Err(anyhow::anyhow!(e)).with_context(|| { - format!("Failed to load metadata for timeline_id {timeline_id}") - }) - } - }, - } - } else { - if !timeline_dir.exists() { - warn!("Timeline dir entry become invalid: {timeline_dir}"); - continue; - } - let timeline_id = TimelineId::try_from(timeline_dir.file_name()) - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline dir name {timeline_dir}", - ) - })?; - let timeline_uninit_mark_file = self - .conf - .timeline_uninit_mark_file_path(self.tenant_shard_id, timeline_id); - if timeline_uninit_mark_file.exists() { - info!( - %timeline_id, - "Found an uninit mark file, removing the timeline and its uninit mark", - ); - if let Err(e) = - remove_timeline_and_uninit_mark(timeline_dir, &timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - continue; - } - - let timeline_delete_mark_file = self - .conf - .timeline_delete_mark_file_path(self.tenant_shard_id, timeline_id); - if timeline_delete_mark_file.exists() 
{ - // Cleanup should be done in `is_delete_mark` branch above - continue; - } - - let file_name = entry.file_name(); - if let Ok(timeline_id) = file_name.parse::() { - let metadata = load_metadata(self.conf, &self.tenant_shard_id, &timeline_id) - .context("failed to load metadata")?; - timelines_to_load.insert(timeline_id, metadata); - } else { - // A file or directory that doesn't look like a timeline ID - warn!("unexpected file or directory in timelines directory: {file_name}"); - } - } - } - - // Sort the array of timeline IDs into tree-order, so that parent comes before - // all its children. - tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| { - TenantDirectoryScan { - sorted_timelines_to_load: sorted_timelines, - timelines_to_resume_deletion, - } - }) - } - async fn load_timeline_metadata( self: &Arc, timeline_ids: HashSet, @@ -1330,7 +1256,7 @@ impl Tenant { async move { debug!("starting index part download"); - let index_part = client.download_index_file(cancel_clone).await; + let index_part = client.download_index_file(&cancel_clone).await; debug!("finished index part download"); @@ -1372,145 +1298,6 @@ impl Tenant { Ok(timeline_preloads) } - /// - /// Background task to load in-memory data structures for this tenant, from - /// files on disk. Used at pageserver startup. - /// - /// No background tasks are started as part of this routine. - async fn load_local(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { - span::debug_assert_current_span_has_tenant_id(); - - debug!("loading tenant task"); - - // Load in-memory state to reflect the local files on disk - // - // Scan the directory, peek into the metadata file of each timeline, and - // collect a list of timelines and their ancestors. 
- let span = info_span!("blocking"); - let cloned = Arc::clone(self); - - let scan = tokio::task::spawn_blocking(move || { - let _g = span.entered(); - cloned.scan_and_sort_timelines_dir() - }) - .await - .context("load spawn_blocking") - .and_then(|res| res)?; - - // FIXME original collect_timeline_files contained one more check: - // 1. "Timeline has no ancestor and no layer files" - - // Process loadable timelines first - for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { - if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, ctx, false) - .await - { - match e { - LoadLocalTimelineError::Load(source) => { - return Err(anyhow::anyhow!(source)).with_context(|| { - format!("Failed to load local timeline: {timeline_id}") - }) - } - LoadLocalTimelineError::ResumeDeletion(source) => { - // Make sure resumed deletion wont fail loading for entire tenant. - error!("Failed to resume timeline deletion: {source:#}") - } - } - } - } - - // Resume deletion ones with deleted_mark - for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion { - match maybe_local_metadata { - None => { - // See comment in `scan_and_sort_timelines_dir`. - if let Err(e) = - DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id) - .await - { - warn!( - "cannot clean up deleted timeline dir timeline_id: {} error: {:#}", - timeline_id, e - ); - } - } - Some(local_metadata) => { - if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, ctx, true) - .await - { - match e { - LoadLocalTimelineError::Load(source) => { - // We tried to load deleted timeline, this is a bug. - return Err(anyhow::anyhow!(source).context( - format!("This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}") - )); - } - LoadLocalTimelineError::ResumeDeletion(source) => { - // Make sure resumed deletion wont fail loading for entire tenant. 
- error!("Failed to resume timeline deletion: {source:#}") - } - } - } - } - } - } - - trace!("Done"); - - Ok(()) - } - - /// Subroutine of `load_tenant`, to load an individual timeline - /// - /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata, ctx))] - async fn load_local_timeline( - self: &Arc, - timeline_id: TimelineId, - local_metadata: TimelineMetadata, - ctx: &RequestContext, - found_delete_mark: bool, - ) -> Result<(), LoadLocalTimelineError> { - span::debug_assert_current_span_has_tenant_id(); - - let resources = self.build_timeline_resources(timeline_id); - - if found_delete_mark { - // There is no remote client, we found local metadata. - // Continue cleaning up local disk. - DeleteTimelineFlow::resume_deletion( - Arc::clone(self), - timeline_id, - &local_metadata, - None, - self.deletion_queue_client.clone(), - ) - .await - .context("resume deletion") - .map_err(LoadLocalTimelineError::ResumeDeletion)?; - return Ok(()); - } - - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}")) - .map_err(LoadLocalTimelineError::Load)?; - Some(ancestor_timeline) - } else { - None - }; - - self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx) - .await - .map_err(LoadLocalTimelineError::Load) - } - - pub(crate) fn tenant_id(&self) -> TenantId { - self.tenant_shard_id.tenant_id - } - pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -1526,13 +1313,13 @@ impl Tenant { let timeline = timelines_accessor .get(&timeline_id) .ok_or(GetTimelineError::NotFound { - tenant_id: self.tenant_shard_id.tenant_id, + tenant_id: self.tenant_shard_id, timeline_id, })?; if active_only && !timeline.is_active() { Err(GetTimelineError::NotActive { - tenant_id: 
self.tenant_shard_id.tenant_id, + tenant_id: self.tenant_shard_id, timeline_id, state: timeline.current_state(), }) @@ -1563,11 +1350,6 @@ impl Tenant { /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0)) /// and the timeline will fail to load at a restart. /// - /// That's why we add an uninit mark file, and wrap it together witht the Timeline - /// in-memory object into UninitializedTimeline. - /// Once the caller is done setting up the timeline, they should call - /// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark. - /// /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the /// minimum amount of keys required to get a writable timeline. /// (Without it, `put` might fail due to `repartition` failing.) @@ -1583,7 +1365,9 @@ impl Tenant { "Cannot create empty timelines on inactive tenant" ); - let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?; + // Protect against concurrent attempts to use this TimelineId + let create_guard = self.create_timeline_create_guard(new_timeline_id)?; + let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to // make it valid, before calling finish_creation() @@ -1598,9 +1382,10 @@ impl Tenant { self.prepare_new_timeline( new_timeline_id, &new_metadata, - timeline_uninit_mark, + create_guard, initdb_lsn, None, + None, ) .await } @@ -1639,13 +1424,7 @@ impl Tenant { tline.freeze_and_flush().await.context("freeze_and_flush")?; // Make sure the freeze_and_flush reaches remote storage. - tline - .remote_client - .as_ref() - .unwrap() - .wait_completion() - .await - .unwrap(); + tline.remote_client.wait_completion().await.unwrap(); let tl = uninit_tl.finish_creation()?; // The non-test code would call tl.activate() here. @@ -1653,6 +1432,36 @@ impl Tenant { Ok(tl) } + /// Helper for unit tests to create a timeline with some pre-loaded states. 
+ #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub async fn create_test_timeline_with_layers( + &self, + new_timeline_id: TimelineId, + initdb_lsn: Lsn, + pg_version: u32, + ctx: &RequestContext, + delta_layer_desc: Vec>, + image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, + end_lsn: Lsn, + ) -> anyhow::Result> { + let tline = self + .create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx) + .await?; + tline.force_advance_lsn(end_lsn); + for deltas in delta_layer_desc { + tline + .force_create_delta_layer(deltas, Some(initdb_lsn), ctx) + .await?; + } + for (lsn, images) in image_layer_desc { + tline + .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx) + .await?; + } + Ok(tline) + } + /// Create a new timeline. /// /// Returns the new timeline ID and reference to its Timeline object. @@ -1661,7 +1470,7 @@ impl Tenant { /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists. #[allow(clippy::too_many_arguments)] pub(crate) async fn create_timeline( - &self, + self: &Arc, new_timeline_id: TimelineId, ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, @@ -1671,9 +1480,13 @@ impl Tenant { ctx: &RequestContext, ) -> Result, CreateTimelineError> { if !self.is_active() { - return Err(CreateTimelineError::Other(anyhow::anyhow!( - "Cannot create timelines on inactive tenant" - ))); + if matches!(self.current_state(), TenantState::Stopping { .. }) { + return Err(CreateTimelineError::ShuttingDown); + } else { + return Err(CreateTimelineError::Other(anyhow::anyhow!( + "Cannot create timelines on inactive tenant" + ))); + } } let _gate = self @@ -1682,9 +1495,8 @@ impl Tenant { .map_err(|_| CreateTimelineError::ShuttingDown)?; // Get exclusive access to the timeline ID: this ensures that it does not already exist, - // and that no other creation attempts will be allowed in while we are working. The - // uninit_mark is a guard. 
- let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) { + // and that no other creation attempts will be allowed in while we are working. + let create_guard = match self.create_timeline_create_guard(new_timeline_id) { Ok(m) => m, Err(TimelineExclusionError::AlreadyCreating) => { // Creation is in progress, we cannot create it again, and we cannot @@ -1708,25 +1520,26 @@ impl Tenant { return Err(CreateTimelineError::Conflict); } - if let Some(remote_client) = existing.remote_client.as_ref() { - // Wait for uploads to complete, so that when we return Ok, the timeline - // is known to be durable on remote storage. Just like we do at the end of - // this function, after we have created the timeline ourselves. - // - // We only really care that the initial version of `index_part.json` has - // been uploaded. That's enough to remember that the timeline - // exists. However, there is no function to wait specifically for that so - // we just wait for all in-progress uploads to finish. - remote_client - .wait_completion() - .await - .context("wait for timeline uploads to complete")?; - } + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. + existing + .remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; return Ok(existing); } }; + pausable_failpoint!("timeline-creation-after-uninit"); + let loaded_timeline = match ancestor_timeline_id { Some(ancestor_timeline_id) => { let ancestor_timeline = self @@ -1760,10 +1573,10 @@ impl Tenant { // sizes etc. 
and that would get confused if the previous page versions // are not in the repository yet. ancestor_timeline - .wait_lsn(*lsn, ctx) + .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) .await .map_err(|e| match e { - e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { + e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => { CreateTimelineError::AncestorLsn(anyhow::anyhow!(e)) } WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown, @@ -1774,7 +1587,7 @@ impl Tenant { &ancestor_timeline, new_timeline_id, ancestor_start_lsn, - uninit_mark, + create_guard, ctx, ) .await? @@ -1784,7 +1597,7 @@ impl Tenant { new_timeline_id, pg_version, load_existing_initdb, - uninit_mark, + create_guard, ctx, ) .await? @@ -1795,16 +1608,16 @@ impl Tenant { // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must // not send a success to the caller until it is. The same applies to handling retries, // see the handling of [`TimelineExclusionError::AlreadyExists`] above. - if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { - let kind = ancestor_timeline_id - .map(|_| "branched") - .unwrap_or("bootstrapped"); - remote_client.wait_completion().await.with_context(|| { - format!("wait for {} timeline initial uploads to complete", kind) - })?; - } + let kind = ancestor_timeline_id + .map(|_| "branched") + .unwrap_or("bootstrapped"); + loaded_timeline + .remote_client + .wait_completion() + .await + .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?; - loaded_timeline.activate(broker_client, None, ctx); + loaded_timeline.activate(self.clone(), broker_client, None, ctx); Ok(loaded_timeline) } @@ -1831,27 +1644,26 @@ impl Tenant { /// GC cutoff point is determined conservatively by either `horizon` and `pitr`, whichever /// requires more history to be retained. 
// - pub async fn gc_iteration( + pub(crate) async fn gc_iteration( &self, target_timeline_id: Option, horizon: u64, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Don't start doing work during shutdown if let TenantState::Stopping { .. } = self.current_state() { return Ok(GcResult::default()); } // there is a global allowed_error for this - anyhow::ensure!( - self.is_active(), - "Cannot run GC iteration on inactive tenant" - ); + if !self.is_active() { + return Err(GcError::NotActive); + } { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() { info!("Skipping GC in location state {:?}", conf.location); @@ -1878,7 +1690,7 @@ impl Tenant { } { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); return Ok(()); @@ -1915,6 +1727,34 @@ impl Tenant { Ok(()) } + // Call through to all timelines to freeze ephemeral layers if needed. Usually + // this happens during ingest: this background housekeeping is for freezing layers + // that are open but haven't been written to for some time. + async fn ingest_housekeeping(&self) { + // Scan through the hashmap and collect a list of all the timelines, + // while holding the lock. Then drop the lock and actually perform the + // compactions. We don't want to block everything else while the + // compaction runs. 
+ let timelines = { + self.timelines + .lock() + .unwrap() + .values() + .filter_map(|timeline| { + if timeline.is_active() { + Some(timeline.clone()) + } else { + None + } + }) + .collect::>() + }; + + for timeline in &timelines { + timeline.maybe_freeze_ephemeral_layer().await; + } + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -1923,6 +1763,14 @@ impl Tenant { self.current_state() == TenantState::Active } + pub fn generation(&self) -> Generation { + self.generation + } + + pub(crate) fn wal_redo_manager_status(&self) -> Option { + self.walredo_mgr.as_ref().and_then(|mgr| mgr.status()) + } + /// Changes tenant status to active, unless shutdown was already requested. /// /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup @@ -1968,7 +1816,12 @@ impl Tenant { let mut activated_timelines = 0; for timeline in timelines_to_activate { - timeline.activate(broker_client.clone(), background_jobs_can_start, ctx); + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + ctx, + ); activated_timelines += 1; } @@ -2014,7 +1867,7 @@ impl Tenant { async fn shutdown( &self, shutdown_progress: completion::Barrier, - freeze_and_flush: bool, + shutdown_mode: timeline::ShutdownMode, ) -> Result<(), completion::Barrier> { span::debug_assert_current_span_has_tenant_id(); @@ -2036,6 +1889,13 @@ impl Tenant { // It's mesed up. // we just ignore the failure to stop + // If we're still attaching, fire the cancellation token early to drop out: this + // will prevent us flushing, but ensures timely shutdown if some I/O during attach + // is very slow. 
+ if matches!(self.current_state(), TenantState::Attaching) { + self.cancel.cancel(); + } + match self.set_stopping(shutdown_progress, false, false).await { Ok(()) => {} Err(SetStoppingError::Broken) => { @@ -2053,14 +1913,9 @@ impl Tenant { let timelines = self.timelines.lock().unwrap(); timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); - let span = Span::current(); - js.spawn(async move { - if freeze_and_flush { - timeline.flush_and_shutdown().instrument(span).await - } else { - timeline.shutdown().instrument(span).await - } - }); + let timeline_id = timeline.timeline_id; + let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode); + js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await }); }) }; // test_long_timeline_create_then_tenant_delete is leaning on this message @@ -2089,6 +1944,8 @@ impl Tenant { // Wait for any in-flight operations to complete self.gate.close().await; + remove_tenant_metrics(&self.tenant_shard_id); + Ok(()) } @@ -2288,7 +2145,12 @@ impl Tenant { TenantState::Active { .. } => { return Ok(()); } - TenantState::Broken { .. } | TenantState::Stopping { .. } => { + TenantState::Broken { reason, .. } => { + // This is fatal, and reported distinctly from the general case of "will never be active" because + // it's logically a 500 to external API users (broken is always a bug). + return Err(GetActiveTenantError::Broken(reason)); + } + TenantState::Stopping { .. } => { // There's no chance the tenant can transition back into ::Active return Err(GetActiveTenantError::WillNotBecomeActive(current_state)); } @@ -2297,21 +2159,126 @@ impl Tenant { } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { - self.tenant_conf - .read() - .unwrap() - .location - .attach_mode - .clone() + self.tenant_conf.load().location.attach_mode + } + + /// For API access: generate a LocationConfig equivalent to the one that would be used to + /// create a Tenant in the same state. 
Do not use this in hot paths: it's for relatively + /// rare external API calls, like a reconciliation at startup. + pub(crate) fn get_location_conf(&self) -> models::LocationConfig { + let conf = self.tenant_conf.load(); + + let location_config_mode = match conf.location.attach_mode { + AttachmentMode::Single => models::LocationConfigMode::AttachedSingle, + AttachmentMode::Multi => models::LocationConfigMode::AttachedMulti, + AttachmentMode::Stale => models::LocationConfigMode::AttachedStale, + }; + + // We have a pageserver TenantConf, we need the API-facing TenantConfig. + let tenant_config: models::TenantConfig = conf.tenant_conf.clone().into(); + + models::LocationConfig { + mode: location_config_mode, + generation: self.generation.into(), + secondary_conf: None, + shard_number: self.shard_identity.number.0, + shard_count: self.shard_identity.count.literal(), + shard_stripe_size: self.shard_identity.stripe_size.0, + tenant_conf: tenant_config, + } } pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId { &self.tenant_shard_id } + pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { + self.shard_identity.stripe_size + } + pub(crate) fn get_generation(&self) -> Generation { self.generation } + + /// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible, + /// and can leave the tenant in a bad state if it fails. The caller is responsible for + /// resetting this tenant to a valid state if we fail. + pub(crate) async fn split_prepare( + &self, + child_shards: &Vec, + ) -> anyhow::Result<()> { + let timelines = self.timelines.lock().unwrap().clone(); + for timeline in timelines.values() { + // We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels + // to ensure that they do not start a split if currently in the process of doing these. 
+ + // Upload an index from the parent: this is partly to provide freshness for the + // child tenants that will copy it, and partly for general ease-of-debugging: there will + // always be a parent shard index in the same generation as we wrote the child shard index. + timeline + .remote_client + .schedule_index_upload_for_file_changes()?; + timeline.remote_client.wait_completion().await?; + + // Shut down the timeline's remote client: this means that the indices we write + // for child shards will not be invalidated by the parent shard deleting layers. + timeline.remote_client.shutdown().await; + + // Download methods can still be used after shutdown, as they don't flow through the remote client's + // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this + // operation is rare, so it's simpler to just download it (and robustly guarantees that the index + // we use here really is the remotely persistent one). + let result = timeline.remote_client + .download_index_file(&self.cancel) + .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) + .await?; + let index_part = match result { + MaybeDeletedIndexPart::Deleted(_) => { + anyhow::bail!("Timeline deletion happened concurrently with split") + } + MaybeDeletedIndexPart::IndexPart(p) => p, + }; + + for child_shard in child_shards { + upload_index_part( + &self.remote_storage, + child_shard, + &timeline.timeline_id, + self.generation, + &index_part, + &self.cancel, + ) + .await?; + } + } + + Ok(()) + } + + pub(crate) fn get_sizes(&self) -> TopTenantShardItem { + let mut result = TopTenantShardItem { + id: self.tenant_shard_id, + resident_size: 0, + physical_size: 0, + max_logical_size: 0, + }; + + for timeline in self.timelines.lock().unwrap().values() { + result.resident_size += timeline.metrics.resident_physical_size_gauge.get(); + + result.physical_size += 
timeline + .remote_client + .metrics + .remote_physical_size_gauge + .get(); + result.max_logical_size = std::cmp::max( + result.max_logical_size, + timeline.metrics.current_logical_size_gauge.get(), + ); + } + + result + } } /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id), @@ -2365,93 +2332,93 @@ where impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf + self.tenant_conf.load().tenant_conf.clone() } pub fn effective_config(&self) -> TenantConf { self.tenant_specific_overrides() - .merge(self.conf.default_tenant_conf) + .merge(self.conf.default_tenant_conf.clone()) } pub fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } pub fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } pub fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } pub fn get_compaction_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_period .unwrap_or(self.conf.default_tenant_conf.compaction_period) } pub fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_threshold 
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } pub fn get_gc_horizon(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_horizon .unwrap_or(self.conf.default_tenant_conf.gc_horizon) } pub fn get_gc_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_period .unwrap_or(self.conf.default_tenant_conf.gc_period) } pub fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } pub fn get_pitr_interval(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .pitr_interval .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .trace_read_requests .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } pub fn get_min_resident_size_override(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .min_resident_size_override .or(self.conf.default_tenant_conf.min_resident_size_override) } pub fn get_heatmap_period(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); let heatmap_period = tenant_conf .heatmap_period .unwrap_or(self.conf.default_tenant_conf.heatmap_period); @@ -2462,28 +2429,66 @@ impl 
Tenant { } } + pub fn get_lsn_lease_length(&self) -> Duration { + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + tenant_conf + .lsn_lease_length + .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { - self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; + // Use read-copy-update in order to avoid overwriting the location config + // state if this races with [`Tenant::set_new_location_config`]. Note that + // this race is not possible if both request types come from the storage + // controller (as they should!) because an exclusive op lock is required + // on the storage controller side. + self.tenant_conf.rcu(|inner| { + Arc::new(AttachedTenantConf { + tenant_conf: new_tenant_conf.clone(), + location: inner.location, + }) + }); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { - *self.tenant_conf.write().unwrap() = new_conf; + let new_tenant_conf = new_conf.tenant_conf.clone(); + + self.tenant_conf.store(Arc::new(new_conf)); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. 
let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } + fn get_timeline_get_throttle_config( + psconf: &'static PageServerConf, + overrides: &TenantConfOpt, + ) -> throttle::Config { + overrides + .timeline_get_throttle + .clone() + .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone()) + } + + pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); + self.timeline_get_throttle.reconfigure(conf) + } + /// Helper function to create a new Timeline struct. /// /// The returned Timeline is in Loading state. The caller is responsible for @@ -2500,6 +2505,7 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, + last_aux_file_policy: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -2524,10 +2530,11 @@ impl Tenant { self.tenant_shard_id, self.generation, self.shard_identity, - Arc::clone(&self.walredo_mgr), + self.walredo_mgr.clone(), resources, pg_version, state, + last_aux_file_policy, self.cancel.child_token(), ); @@ -2542,15 +2549,24 @@ impl Tenant { conf: &'static PageServerConf, attached_conf: AttachedTenantConf, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, tenant_shard_id: TenantShardId, - remote_storage: Option, + remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, ) -> Tenant { let (state, mut rx) = watch::channel(state); tokio::spawn(async move { + // reflect tenant state in metrics: + // - global per tenant state: TENANT_STATE_METRIC + // - "set" of broken tenants: BROKEN_TENANTS_SET + // + // set of broken tenants should not have zero counts so that it remains accessible for + // alerting. 
+ let tid = tenant_shard_id.to_string(); + let shard_id = tenant_shard_id.shard_slug().to_string(); + let set_key = &[tid.as_str(), shard_id.as_str()][..]; fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) { ([state.into()], matches!(state, TenantState::Broken { .. })) @@ -2559,19 +2575,13 @@ impl Tenant { let mut tuple = inspect_state(&rx.borrow_and_update()); let is_broken = tuple.1; - let mut counted_broken = if !is_broken { - // the tenant might be ignored and reloaded, so first remove any previous set - // element. it most likely has already been scraped, as these are manual operations - // right now. most likely we will add it back very soon. - drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid])); - false - } else { + let mut counted_broken = if is_broken { // add the id to the set right away, there should not be any updates on the channel - // after - crate::metrics::BROKEN_TENANTS_SET - .with_label_values(&[&tid]) - .set(1); + // after before tenant is removed, if ever + BROKEN_TENANTS_SET.with_label_values(set_key).set(1); true + } else { + false }; loop { @@ -2580,10 +2590,9 @@ impl Tenant { current.inc(); if rx.changed().await.is_err() { - // tenant has been dropped; decrement the counter because a tenant with that - // state is no longer in tenant map, but allow any broken set item to exist - // still. 
+ // tenant has been dropped current.dec(); + drop(BROKEN_TENANTS_SET.remove_label_values(set_key)); break; } @@ -2593,10 +2602,9 @@ impl Tenant { let is_broken = tuple.1; if is_broken && !counted_broken { counted_broken = true; - // insert the tenant_id (back) into the set - crate::metrics::BROKEN_TENANTS_SET - .with_label_values(&[&tid]) - .inc(); + // insert the tenant_id (back) into the set while avoiding needless counter + // access + BROKEN_TENANTS_SET.with_label_values(set_key).set(1); } } }); @@ -2609,7 +2617,6 @@ impl Tenant { // using now here is good enough approximation to catch tenants with really long // activation times. constructed_at: Instant::now(), - tenant_conf: Arc::new(RwLock::new(attached_conf)), timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), gc_cs: tokio::sync::Mutex::new(()), @@ -2623,7 +2630,13 @@ impl Tenant { activate_now_sem: tokio::sync::Semaphore::new(0), delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), cancel: CancellationToken::default(), - gate: Gate::new(format!("Tenant<{tenant_shard_id}>")), + gate: Gate::default(), + timeline_get_throttle: Arc::new(throttle::Throttle::new( + Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), + &crate::metrics::tenant_throttling::TIMELINE_GET, + )), + tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), + ongoing_timeline_detach: std::sync::Mutex::default(), } } @@ -2655,10 +2668,11 @@ impl Tenant { } } - // Legacy configs are implicitly in attached state + // Legacy configs are implicitly in attached state, and do not support sharding Ok(LocationConf::attached_single( tenant_conf, Generation::none(), + &models::ShardParameters::default(), )) } else { // FIXME If the config file is not found, assume that we're attaching @@ -2710,19 +2724,24 @@ impl Tenant { legacy_config_path: &Utf8Path, location_conf: &LocationConf, ) -> anyhow::Result<()> { - // Forward compat: write out an old-style 
configuration that old versions can read, in case we roll back - Self::persist_tenant_config_legacy( - tenant_shard_id, - legacy_config_path, - &location_conf.tenant_conf, - ) - .await?; - if let LocationMode::Attached(attach_conf) = &location_conf.mode { - // Once we use LocationMode, generations are mandatory. If we aren't using generations, - // then drop out after writing legacy-style config. + // The modern-style LocationConf config file requires a generation to be set. In case someone + // is running a pageserver without the infrastructure to set generations, write out the legacy-style + // config file that only contains TenantConf. + // + // This will eventually be removed in https://github.com/neondatabase/neon/issues/5388 + if attach_conf.generation.is_none() { - tracing::debug!("Running without generations, not writing new-style LocationConf"); + tracing::info!( + "Running without generations, writing legacy-style tenant config file" + ); + Self::persist_tenant_config_legacy( + tenant_shard_id, + legacy_config_path, + &location_conf.tenant_conf, + ) + .await?; + return Ok(()); } } @@ -2734,6 +2753,10 @@ impl Tenant { "# .to_string(); + fail::fail_point!("tenant-config-before-write", |_| { + anyhow::bail!("tenant-config-before-write"); + }); + // Convert the config to a toml file. 
conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; @@ -2741,17 +2764,10 @@ impl Tenant { let tenant_shard_id = *tenant_shard_id; let config_path = config_path.to_owned(); - tokio::task::spawn_blocking(move || { - Handle::current().block_on(async move { - let conf_content = conf_content.as_bytes(); - VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {config_path}") - }) - }) - }) - .await??; + let conf_content = conf_content.into_bytes(); + VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) + .await + .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; Ok(()) } @@ -2778,17 +2794,12 @@ impl Tenant { let tenant_shard_id = *tenant_shard_id; let target_config_path = target_config_path.to_owned(); - tokio::task::spawn_blocking(move || { - Handle::current().block_on(async move { - let conf_content = conf_content.as_bytes(); - VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content) - .await - .with_context(|| { - format!("write tenant {tenant_shard_id} config to {target_config_path}") - }) - }) - }) - .await??; + let conf_content = conf_content.into_bytes(); + VirtualFile::crashsafe_overwrite(target_config_path.clone(), temp_path, conf_content) + .await + .with_context(|| { + format!("write tenant {tenant_shard_id} config to {target_config_path}") + })?; Ok(()) } @@ -2824,28 +2835,13 @@ impl Tenant { pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); - let gc_timelines = match self + let gc_timelines = self .refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx) - .await - { - Ok(result) => result, - Err(e) => { - if let Some(PageReconstructError::Cancelled) = - e.downcast_ref::() - { - // Handle cancellation - totals.elapsed 
= now.elapsed(); - return Ok(totals); - } else { - // Propagate other errors - return Err(e); - } - } - }; + .await?; failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -2865,12 +2861,24 @@ impl Tenant { // See comments in [`Tenant::branch_timeline`] for more information about why branch // creation task can run concurrently with timeline's GC iteration. for timeline in gc_timelines { - if task_mgr::is_shutdown_requested() || cancel.is_cancelled() { + if cancel.is_cancelled() { // We were requested to shut down. Stop and return with the progress we // made. break; } - let result = timeline.gc().await?; + let result = match timeline.gc().await { + Err(GcError::TimelineCancelled) => { + if target_timeline_id.is_some() { + // If we were targetting this specific timeline, surface cancellation to caller + return Err(GcError::TimelineCancelled); + } else { + // A timeline may be shutting down independently of the tenant's lifecycle: we should + // skip past this and proceed to try GC on other timelines. + continue; + } + } + r => r?, + }; totals += result; } @@ -2883,11 +2891,11 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub async fn refresh_gc_info( + pub(crate) async fn refresh_gc_info( &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result>> { + ) -> Result>, GcError> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -2908,25 +2916,58 @@ impl Tenant { pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result>> { - // grab mutex to prevent new timelines from being created here. 
+ ) -> Result>, GcError> { + // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for + // currently visible timelines. + let timelines = self + .timelines + .lock() + .unwrap() + .values() + .filter(|tl| match target_timeline_id.as_ref() { + Some(target) => &tl.timeline_id == target, + None => true, + }) + .cloned() + .collect::>(); + + let mut gc_cutoffs: HashMap = + HashMap::with_capacity(timelines.len()); + + for timeline in timelines.iter() { + let cutoff = timeline + .get_last_record_lsn() + .checked_sub(horizon) + .unwrap_or(Lsn(0)); + + let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?; + let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs); + assert!(old.is_none()); + } + + if !self.is_active() || self.cancel.is_cancelled() { + return Err(GcError::TenantCancelled); + } + + // grab mutex to prevent new timelines from being created here; avoid doing long operations + // because that will stall branch creation. let gc_cs = self.gc_cs.lock().await; // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. 
- let (all_branchpoints, timeline_ids): (BTreeSet<(TimelineId, Lsn)>, _) = { + let (all_branchpoints, timelines): (BTreeSet<(TimelineId, Lsn)>, _) = { let timelines = self.timelines.lock().unwrap(); let mut all_branchpoints = BTreeSet::new(); - let timeline_ids = { + let timelines = { if let Some(target_timeline_id) = target_timeline_id.as_ref() { if timelines.get(target_timeline_id).is_none() { - bail!("gc target timeline does not exist") + return Err(GcError::TimelineNotFound); } }; timelines .iter() - .map(|(timeline_id, timeline_entry)| { + .map(|(_timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { @@ -2948,43 +2989,55 @@ impl Tenant { } } - *timeline_id + timeline_entry.clone() }) .collect::>() }; - (all_branchpoints, timeline_ids) + (all_branchpoints, timelines) }; // Ok, we now know all the branch points. // Update the GC information for each timeline. - let mut gc_timelines = Vec::with_capacity(timeline_ids.len()); - for timeline_id in timeline_ids { - // Timeline is known to be local and loaded. 
- let timeline = self - .get_timeline(timeline_id, false) - .with_context(|| format!("Timeline {timeline_id} was not found"))?; - + let mut gc_timelines = Vec::with_capacity(timelines.len()); + for timeline in timelines { // If target_timeline is specified, ignore all other timelines if let Some(target_timeline_id) = target_timeline_id { - if timeline_id != target_timeline_id { + if timeline.timeline_id != target_timeline_id { continue; } } - if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) { - let branchpoints: Vec = all_branchpoints - .range(( - Included((timeline_id, Lsn(0))), - Included((timeline_id, Lsn(u64::MAX))), - )) - .map(|&x| x.1) - .collect(); - timeline - .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx) - .await?; + let branchpoints: Vec = all_branchpoints + .range(( + Included((timeline.timeline_id, Lsn(0))), + Included((timeline.timeline_id, Lsn(u64::MAX))), + )) + .map(|&x| x.1) + .collect(); - gc_timelines.push(timeline); + { + let mut target = timeline.gc_info.write().unwrap(); + + let now = SystemTime::now(); + target.leases.retain(|_, lease| !lease.is_expired(&now)); + + match gc_cutoffs.remove(&timeline.timeline_id) { + Some(cutoffs) => { + target.retain_lsns = branchpoints; + target.cutoffs = cutoffs; + } + None => { + // reasons for this being unavailable: + // - this timeline was created while we were finding cutoffs + // - lsn for timestamp search fails for this timeline repeatedly + // + // in both cases, refreshing the branchpoints is correct. 
+ target.retain_lsns = branchpoints; + } + }; } + + gc_timelines.push(timeline); } drop(gc_cs); Ok(gc_timelines) @@ -2999,17 +3052,53 @@ impl Tenant { &self, src_timeline: &Arc, dst_id: TimelineId, - start_lsn: Option, + ancestor_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap(); + let create_guard = self.create_timeline_create_guard(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx) + .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx) .await?; tl.set_state(TimelineState::Active); Ok(tl) } + /// Helper for unit tests to branch a timeline with some pre-loaded states. + #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub async fn branch_timeline_test_with_layers( + &self, + src_timeline: &Arc, + dst_id: TimelineId, + ancestor_lsn: Option, + ctx: &RequestContext, + delta_layer_desc: Vec>, + image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, + end_lsn: Lsn, + ) -> anyhow::Result> { + let tline = self + .branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx) + .await?; + let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn { + ancestor_lsn + } else { + tline.get_last_record_lsn() + }; + assert!(end_lsn >= ancestor_lsn); + tline.force_advance_lsn(end_lsn); + for deltas in delta_layer_desc { + tline + .force_create_delta_layer(deltas, Some(ancestor_lsn), ctx) + .await?; + } + for (lsn, images) in image_layer_desc { + tline + .force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx) + .await?; + } + Ok(tline) + } + /// Branch an existing timeline. /// /// The caller is responsible for activating the returned timeline. 
@@ -3018,10 +3107,10 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx) + self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx) .await } @@ -3030,7 +3119,7 @@ impl Tenant { src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, _ctx: &RequestContext, ) -> Result, CreateTimelineError> { let src_id = src_timeline.timeline_id; @@ -3071,7 +3160,7 @@ impl Tenant { // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); - let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); + let cutoff = gc_info.min_cutoff(); if start_lsn < cutoff { return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!( "invalid branch start lsn: less than planned GC cutoff {cutoff}" @@ -3114,9 +3203,10 @@ impl Tenant { .prepare_new_timeline( dst_id, &metadata, - timeline_uninit_mark, + timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), + src_timeline.last_aux_file_policy.load(), ) .await?; @@ -3127,20 +3217,17 @@ impl Tenant { // We still need to upload its metadata eagerly: if other nodes `attach` the tenant and miss this timeline, their GC // could get incorrect information and remove more layers, than needed. 
// See also https://github.com/neondatabase/neon/issues/3865 - if let Some(remote_client) = new_timeline.remote_client.as_ref() { - remote_client - .schedule_index_upload_for_metadata_update(&metadata) - .context("branch initial metadata upload")?; - } - - info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); + new_timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata) + .context("branch initial metadata upload")?; Ok(new_timeline) } /// For unit tests, make this visible so that other modules can directly create timelines #[cfg(test)] - #[tracing::instrument(fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] + #[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn bootstrap_timeline_test( &self, timeline_id: TimelineId, @@ -3148,17 +3235,66 @@ impl Tenant { load_existing_initdb: Option, ctx: &RequestContext, ) -> anyhow::Result> { - let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap(); + let create_guard = self.create_timeline_create_guard(timeline_id).unwrap(); self.bootstrap_timeline( timeline_id, pg_version, load_existing_initdb, - uninit_mark, + create_guard, ctx, ) .await } + async fn upload_initdb( + &self, + timelines_path: &Utf8PathBuf, + pgdata_path: &Utf8PathBuf, + timeline_id: &TimelineId, + ) -> anyhow::Result<()> { + let temp_path = timelines_path.join(format!( + "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}" + )); + + scopeguard::defer! 
{ + if let Err(e) = fs::remove_file(&temp_path) { + error!("Failed to remove temporary initdb archive '{temp_path}': {e}"); + } + } + + let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?; + const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024; + if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT { + warn!( + "compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}." + ); + } + + pausable_failpoint!("before-initdb-upload"); + + backoff::retry( + || async { + self::remote_timeline_client::upload_initdb_dir( + &self.remote_storage, + &self.tenant_shard_id.tenant_id, + timeline_id, + pgdata_zstd.try_clone().await?, + tar_zst_size, + &self.cancel, + ) + .await + }, + |_| false, + 3, + u32::MAX, + "persist_initdb_tar_zst", + &self.cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + } + /// - run initdb to init temporary instance and get bootstrap data /// - after initialization completes, tar up the temp dir and upload it to S3. /// @@ -3168,7 +3304,7 @@ impl Tenant { timeline_id: TimelineId, pg_version: u32, load_existing_initdb: Option, - timeline_uninit_mark: TimelineUninitMark<'_>, + timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, ) -> anyhow::Result> { // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` @@ -3180,13 +3316,14 @@ impl Tenant { TEMP_FILE_SUFFIX, ); - // an uninit mark was placed before, nothing else can access this timeline files - // current initdb was not run yet, so remove whatever was left from the previous runs + // Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees + // we won't race with other creations or existent timelines with the same path. 
if pgdata_path.exists() { fs::remove_dir_all(&pgdata_path).with_context(|| { format!("Failed to remove already existing initdb directory: {pgdata_path}") })?; } + // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it scopeguard::defer! { if let Err(e) = fs::remove_dir_all(&pgdata_path) { @@ -3195,79 +3332,50 @@ impl Tenant { } } if let Some(existing_initdb_timeline_id) = load_existing_initdb { - let Some(storage) = &self.remote_storage else { - bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}"); - }; + if existing_initdb_timeline_id != timeline_id { + let source_path = &remote_initdb_archive_path( + &self.tenant_shard_id.tenant_id, + &existing_initdb_timeline_id, + ); + let dest_path = + &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id); + + // if this fails, it will get retried by retried control plane requests + self.remote_storage + .copy_object(source_path, dest_path, &self.cancel) + .await + .context("copy initdb tar")?; + } let (initdb_tar_zst_path, initdb_tar_zst) = self::remote_timeline_client::download_initdb_tar_zst( self.conf, - storage, + &self.remote_storage, &self.tenant_shard_id, &existing_initdb_timeline_id, &self.cancel, ) .await .context("download initdb tar")?; + + scopeguard::defer! 
{ + if let Err(e) = fs::remove_file(&initdb_tar_zst_path) { + error!("Failed to remove temporary initdb archive '{initdb_tar_zst_path}': {e}"); + } + } + let buf_read = BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst); - import_datadir::extract_tar_zst(&pgdata_path, buf_read) + extract_zst_tarball(&pgdata_path, buf_read) .await .context("extract initdb tar")?; - - tokio::fs::remove_file(&initdb_tar_zst_path) - .await - .or_else(|e| { - if e.kind() == std::io::ErrorKind::NotFound { - // If something else already removed the file, ignore the error - Ok(()) - } else { - Err(e) - } - }) - .with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?; } else { - // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path + // Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; // Upload the created data dir to S3 - if let Some(storage) = &self.remote_storage { - let temp_path = timelines_path.join(format!( - "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}" - )); - - let (pgdata_zstd, tar_zst_size) = - import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?; - backoff::retry( - || async { - self::remote_timeline_client::upload_initdb_dir( - storage, - &self.tenant_shard_id.tenant_id, - &timeline_id, - pgdata_zstd.try_clone().await?, - tar_zst_size, - &self.cancel, - ) - .await - }, - |_| false, - 3, - u32::MAX, - "persist_initdb_tar_zst", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), - ) - .await?; - - tokio::fs::remove_file(&temp_path) - .await - .or_else(|e| { - if e.kind() == std::io::ErrorKind::NotFound { - // If something else already removed the file, ignore the error - Ok(()) - } else { - Err(e) - } - }) - .with_context(|| format!("tempfile removal {temp_path}"))?; + if self.tenant_shard_id().is_shard_zero() { + 
self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id) + .await?; } } let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align(); @@ -3289,15 +3397,22 @@ impl Tenant { .prepare_new_timeline( timeline_id, &new_metadata, - timeline_uninit_mark, + timeline_create_guard, pgdata_lsn, None, + None, ) .await?; let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; let unfinished_timeline = raw_timeline.raw_timeline()?; + // Flush the new layer files to disk, before we make the timeline as available to + // the outside world. + // + // Flush loop needs to be spawned in order to be able to flush. + unfinished_timeline.maybe_spawn_flush_loop(); + import_datadir::import_timeline_from_postgres_datadir( unfinished_timeline, &pgdata_path, @@ -3309,12 +3424,6 @@ impl Tenant { format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") })?; - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - unfinished_timeline.maybe_spawn_flush_loop(); - fail::fail_point!("before-checkpoint-new-timeline", |_| { anyhow::bail!("failpoint before-checkpoint-new-timeline"); }); @@ -3331,34 +3440,22 @@ impl Tenant { // All done! 
let timeline = raw_timeline.finish_creation()?; - info!( - "created root timeline {} timeline.lsn {}", - timeline_id, - timeline.get_last_record_lsn() - ); - Ok(timeline) } /// Call this before constructing a timeline, to build its required structures fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { - let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() { - let remote_client = RemoteTimelineClient::new( - remote_storage.clone(), - self.deletion_queue_client.clone(), - self.conf, - self.tenant_shard_id, - timeline_id, - self.generation, - ); - Some(remote_client) - } else { - None - }; - + let remote_client = RemoteTimelineClient::new( + self.remote_storage.clone(), + self.deletion_queue_client.clone(), + self.conf, + self.tenant_shard_id, + timeline_id, + self.generation, + ); TimelineResources { remote_client, - deletion_queue_client: self.deletion_queue_client.clone(), + timeline_get_throttle: self.timeline_get_throttle.clone(), } } @@ -3366,22 +3463,22 @@ impl Tenant { /// /// An empty layer map is initialized, and new data and WAL can be imported starting /// at 'disk_consistent_lsn'. After any initial data has been imported, call - /// `finish_creation` to insert the Timeline into the timelines map and to remove the - /// uninit mark file. + /// `finish_creation` to insert the Timeline into the timelines map. 
async fn prepare_new_timeline<'a>( &'a self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, - uninit_mark: TimelineUninitMark<'a>, + create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, + last_aux_file_policy: Option, ) -> anyhow::Result { let tenant_shard_id = self.tenant_shard_id; let resources = self.build_timeline_resources(new_timeline_id); - if let Some(remote_client) = &resources.remote_client { - remote_client.init_upload_queue_for_empty_remote(new_metadata)?; - } + resources + .remote_client + .init_upload_queue_for_empty_remote(new_metadata)?; let timeline_struct = self .create_timeline_struct( @@ -3390,17 +3487,18 @@ impl Tenant { ancestor, resources, CreateTimelineCause::Load, + last_aux_file_policy, ) .context("Failed to create timeline data structure")?; timeline_struct.init_empty_layer_map(start_lsn); if let Err(e) = self - .create_timeline_files(&uninit_mark.timeline_path, &new_timeline_id, new_metadata) + .create_timeline_files(&create_guard.timeline_path) .await { error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}"); - cleanup_timeline_directory(uninit_mark); + cleanup_timeline_directory(create_guard); return Err(e); } @@ -3411,54 +3509,31 @@ impl Tenant { Ok(UninitializedTimeline::new( self, new_timeline_id, - Some((timeline_struct, uninit_mark)), + Some((timeline_struct, create_guard)), )) } - async fn create_timeline_files( - &self, - timeline_path: &Utf8Path, - new_timeline_id: &TimelineId, - new_metadata: &TimelineMetadata, - ) -> anyhow::Result<()> { + async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> { crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?; - fail::fail_point!("after-timeline-uninit-mark-creation", |_| { - anyhow::bail!("failpoint after-timeline-uninit-mark-creation"); + fail::fail_point!("after-timeline-dir-creation", |_| { + anyhow::bail!("failpoint 
after-timeline-dir-creation"); }); - save_metadata( - self.conf, - &self.tenant_shard_id, - new_timeline_id, - new_metadata, - ) - .await - .context("Failed to create timeline metadata")?; Ok(()) } - /// Attempts to create an uninit mark file for the timeline initialization. - /// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists. - /// - /// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init. - fn create_timeline_uninit_mark( + /// Get a guard that provides exclusive access to the timeline directory, preventing + /// concurrent attempts to create the same timeline. + fn create_timeline_create_guard( &self, timeline_id: TimelineId, - ) -> Result { + ) -> Result { let tenant_shard_id = self.tenant_shard_id; - let uninit_mark_path = self - .conf - .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id); let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); - let uninit_mark = TimelineUninitMark::new( - self, - timeline_id, - uninit_mark_path.clone(), - timeline_path.clone(), - )?; + let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?; // At this stage, we have got exclusive access to in-memory state for this timeline ID // for creation. @@ -3474,23 +3549,7 @@ impl Tenant { ))); } - // Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees - // that during process runtime, colliding creations will be caught in-memory without getting - // as far as failing to write a file. 
- fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&uninit_mark_path) - .context("Failed to create uninit mark file") - .and_then(|_| { - crashsafe::fsync_file_and_parent(&uninit_mark_path) - .context("Failed to fsync uninit mark file") - }) - .with_context(|| { - format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}") - })?; - - Ok(uninit_mark) + Ok(create_guard) } /// Gathers inputs from all of the timelines to produce a sizing model input. @@ -3505,7 +3564,7 @@ impl Tenant { cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let logical_sizes_at_once = self .conf .concurrent_tenant_size_logical_size_queries @@ -3518,7 +3577,11 @@ impl Tenant { // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. - let mut shared_cache = self.cached_logical_sizes.lock().await; + let mut shared_cache = tokio::select! 
{ + locked = self.cached_logical_sizes.lock() => locked, + _ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled), + _ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled), + }; size::gather_inputs( self, @@ -3541,10 +3604,10 @@ impl Tenant { cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?; - let size = inputs.calculate()?; + let size = inputs.calculate(); self.set_cached_synthetic_size(size); @@ -3556,6 +3619,9 @@ impl Tenant { self.cached_synthetic_tenant_size .store(size, Ordering::Relaxed); + // Only shard zero should be calculating synthetic sizes + debug_assert!(self.shard_identity.is_shard_zero()); + TENANT_SYNTHETIC_SIZE_METRIC .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()]) .unwrap() @@ -3581,9 +3647,7 @@ impl Tenant { tracing::info!(timeline_id=%timeline.timeline_id, "Flushing..."); timeline.freeze_and_flush().await?; tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads..."); - if let Some(client) = &timeline.remote_client { - client.wait_completion().await?; - } + timeline.remote_client.wait_completion().await?; Ok(()) } @@ -3598,9 +3662,8 @@ impl Tenant { // Run each timeline's flush in a task holding the timeline's gate: this // means that if this function's future is cancelled, the Timeline shutdown // will still wait for any I/O in here to complete. 
- let gate = match timeline.gate.enter() { - Ok(g) => g, - Err(_) => continue, + let Ok(gate) = timeline.gate.enter() else { + continue; }; let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await }); results.push(jh); @@ -3625,163 +3688,10 @@ impl Tenant { Ok(()) } -} -fn remove_timeline_and_uninit_mark( - timeline_dir: &Utf8Path, - uninit_mark: &Utf8Path, -) -> anyhow::Result<()> { - fs::remove_dir_all(timeline_dir) - .or_else(|e| { - if e.kind() == std::io::ErrorKind::NotFound { - // we can leave the uninit mark without a timeline dir, - // just remove the mark then - Ok(()) - } else { - Err(e) - } - }) - .with_context(|| { - format!("Failed to remove unit marked timeline directory {timeline_dir}") - })?; - fs::remove_file(uninit_mark) - .with_context(|| format!("Failed to remove timeline uninit mark file {uninit_mark}"))?; - - Ok(()) -} - -pub(crate) async fn create_tenant_files( - conf: &'static PageServerConf, - location_conf: &LocationConf, - tenant_shard_id: &TenantShardId, -) -> anyhow::Result { - let target_tenant_directory = conf.tenant_path(tenant_shard_id); - anyhow::ensure!( - !target_tenant_directory - .try_exists() - .context("check existence of tenant directory")?, - "tenant directory already exists", - ); - - let temporary_tenant_dir = - path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX); - debug!("Creating temporary directory structure in {temporary_tenant_dir}"); - - // top-level dir may exist if we are creating it through CLI - crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| { - format!("could not create temporary tenant directory {temporary_tenant_dir}") - })?; - - let creation_result = try_create_target_tenant_dir( - conf, - location_conf, - tenant_shard_id, - &temporary_tenant_dir, - &target_tenant_directory, - ) - .await; - - if creation_result.is_err() { - error!( - "Failed to create directory structure for tenant {tenant_shard_id}, cleaning tmp data" - ); - if let Err(e) = 
fs::remove_dir_all(&temporary_tenant_dir) { - error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}") - } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) { - error!( - "Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}" - ) - } + pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { + self.tenant_conf.load().tenant_conf.clone() } - - creation_result?; - - Ok(target_tenant_directory) -} - -async fn try_create_target_tenant_dir( - conf: &'static PageServerConf, - location_conf: &LocationConf, - tenant_shard_id: &TenantShardId, - temporary_tenant_dir: &Utf8Path, - target_tenant_directory: &Utf8Path, -) -> Result<(), anyhow::Error> { - let temporary_tenant_timelines_dir = rebase_directory( - &conf.timelines_path(tenant_shard_id), - target_tenant_directory, - temporary_tenant_dir, - ) - .with_context(|| format!("resolve tenant {tenant_shard_id} temporary timelines dir"))?; - let temporary_legacy_tenant_config_path = rebase_directory( - &conf.tenant_config_path(tenant_shard_id), - target_tenant_directory, - temporary_tenant_dir, - ) - .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?; - let temporary_tenant_config_path = rebase_directory( - &conf.tenant_location_config_path(tenant_shard_id), - target_tenant_directory, - temporary_tenant_dir, - ) - .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?; - - Tenant::persist_tenant_config_at( - tenant_shard_id, - &temporary_tenant_config_path, - &temporary_legacy_tenant_config_path, - location_conf, - ) - .await?; - - crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { - format!( - "create tenant {} temporary timelines directory {}", - tenant_shard_id, temporary_tenant_timelines_dir, - ) - })?; - fail::fail_point!("tenant-creation-before-tmp-rename", |_| { - anyhow::bail!("failpoint tenant-creation-before-tmp-rename"); - }); - - // Make sure the current tenant 
directory entries are durable before renaming. - // Without this, a crash may reorder any of the directory entry creations above. - crashsafe::fsync(temporary_tenant_dir) - .with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?; - - fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { - format!( - "move tenant {} temporary directory {} into the permanent one {}", - tenant_shard_id, temporary_tenant_dir, target_tenant_directory - ) - })?; - let target_dir_parent = target_tenant_directory.parent().with_context(|| { - format!( - "get tenant {} dir parent for {}", - tenant_shard_id, target_tenant_directory, - ) - })?; - crashsafe::fsync(target_dir_parent).with_context(|| { - format!( - "fsync renamed directory's parent {} for tenant {}", - target_dir_parent, tenant_shard_id, - ) - })?; - - Ok(()) -} - -fn rebase_directory( - original_path: &Utf8Path, - base: &Utf8Path, - new_base: &Utf8Path, -) -> anyhow::Result { - let relative_path = original_path.strip_prefix(base).with_context(|| { - format!( - "Failed to strip base prefix '{}' off path '{}'", - base, original_path - ) - })?; - Ok(new_base.join(relative_path)) } /// Create the cluster temporarily in 'initdbpath' directory inside the repository @@ -3813,37 +3723,35 @@ async fn run_initdb( .env_clear() .env("LD_LIBRARY_PATH", &initdb_lib_dir) .env("DYLD_LIBRARY_PATH", &initdb_lib_dir) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - // If the `select!` below doesn't finish the `wait_with_output`, - // let the task get `wait()`ed for asynchronously by tokio. - // This means there is a slim chance we can go over the INIT_DB_SEMAPHORE. 
- // TODO: fix for this is non-trivial, see - // https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021 - // - .kill_on_drop(true) + .stdin(std::process::Stdio::null()) + // stdout invocation produces the same output every time, we don't need it + .stdout(std::process::Stdio::null()) + // we would be interested in the stderr output, if there was any + .stderr(std::process::Stdio::piped()) .spawn()?; - tokio::select! { - initdb_output = initdb_command.wait_with_output() => { - let initdb_output = initdb_output?; - if !initdb_output.status.success() { - return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr)); - } - } - _ = cancel.cancelled() => { - return Err(InitdbError::Cancelled); - } + // Ideally we'd select here with the cancellation token, but the problem is that + // we can't safely terminate initdb: it launches processes of its own, and killing + // initdb doesn't kill them. After we return from this function, we want the target + // directory to be able to be cleaned up. + // See https://github.com/neondatabase/neon/issues/6385 + let initdb_output = initdb_command.wait_with_output().await?; + if !initdb_output.status.success() { + return Err(InitdbError::Failed( + initdb_output.status, + initdb_output.stderr, + )); + } + + // This isn't true cancellation support, see above. Still return an error to + // excercise the cancellation code path. + if cancel.is_cancelled() { + return Err(InitdbError::Cancelled); } Ok(()) } -impl Drop for Tenant { - fn drop(&mut self) { - remove_tenant_metrics(&self.tenant_shard_id.tenant_id); - } -} /// Dump contents of a layer file to stdout. 
pub async fn dump_layerfile_from_path( path: &Utf8Path, @@ -3879,21 +3787,17 @@ pub async fn dump_layerfile_from_path( pub(crate) mod harness { use bytes::{Bytes, BytesMut}; use once_cell::sync::OnceCell; + use pageserver_api::models::ShardParameters; use pageserver_api::shard::ShardIndex; - use std::fs; - use std::sync::Arc; use utils::logging; - use utils::lsn::Lsn; use crate::deletion_queue::mock::MockDeletionQueue; - use crate::{ - config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord, - }; + use crate::walredo::apply_neon; + use crate::{repository::Key, walrecord::NeonWalRecord}; use super::*; - use crate::tenant::config::{TenantConf, TenantConfOpt}; use hex_literal::hex; - use utils::id::{TenantId, TimelineId}; + use utils::id::TenantId; pub const TIMELINE_ID: TimelineId = TimelineId::from_array(hex!("11223344556677881122334455667788")); @@ -3901,8 +3805,7 @@ pub(crate) mod harness { TimelineId::from_array(hex!("AA223344556677881122334455667788")); /// Convenience function to create a page image with given string as the only content - #[allow(non_snake_case)] - pub fn TEST_IMG(s: &str) -> Bytes { + pub fn test_img(s: &str) -> Bytes { let mut buf = BytesMut::new(); buf.extend_from_slice(s.as_bytes()); buf.resize(64, 0); @@ -3918,6 +3821,7 @@ pub(crate) mod harness { compaction_target_size: Some(tenant_conf.compaction_target_size), compaction_period: Some(tenant_conf.compaction_period), compaction_threshold: Some(tenant_conf.compaction_threshold), + compaction_algorithm: Some(tenant_conf.compaction_algorithm), gc_horizon: Some(tenant_conf.gc_horizon), gc_period: Some(tenant_conf.gc_period), image_creation_threshold: Some(tenant_conf.image_creation_threshold), @@ -3931,22 +3835,22 @@ pub(crate) mod harness { evictions_low_residence_duration_metric_threshold: Some( tenant_conf.evictions_low_residence_duration_metric_threshold, ), - gc_feedback: Some(tenant_conf.gc_feedback), heatmap_period: Some(tenant_conf.heatmap_period), + 
lazy_slru_download: Some(tenant_conf.lazy_slru_download), + timeline_get_throttle: Some(tenant_conf.timeline_get_throttle), + image_layer_creation_check_threshold: Some( + tenant_conf.image_layer_creation_check_threshold, + ), + switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy), + lsn_lease_length: Some(tenant_conf.lsn_lease_length), + lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), } } } - enum LoadMode { - Local, - Remote, - } - pub struct TenantHarness { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, - // TODO(sharding): remove duplicative `tenant_id` in favor of access to tenant_shard_id - pub(crate) tenant_id: TenantId, pub tenant_shard_id: TenantShardId, pub generation: Generation, pub shard: ShardIndex, @@ -3971,7 +3875,13 @@ pub(crate) mod harness { } impl TenantHarness { - pub fn create(test_name: &'static str) -> anyhow::Result { + pub fn create_custom( + test_name: &'static str, + tenant_conf: TenantConf, + tenant_id: TenantId, + shard_identity: ShardIdentity, + generation: Generation, + ) -> anyhow::Result { setup_logging(); let repo_dir = PageServerConf::test_repo_dir(test_name); @@ -3983,16 +3893,12 @@ pub(crate) mod harness { // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - // Disable automatic GC and compaction to make the unit tests more deterministic. - // The tests perform them manually if needed. 
- let tenant_conf = TenantConf { - gc_period: Duration::ZERO, - compaction_period: Duration::ZERO, - ..TenantConf::default() + let shard = shard_identity.shard_index(); + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number: shard.shard_number, + shard_count: shard.shard_count, }; - - let tenant_id = TenantId::generate(); - let tenant_shard_id = TenantShardId::unsharded(tenant_id); fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?; fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?; @@ -4001,6 +3907,7 @@ pub(crate) mod harness { std::fs::create_dir_all(&remote_fs_dir).unwrap(); let config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; let remote_storage = GenericRemoteStorage::from_config(&config).unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); @@ -4008,55 +3915,52 @@ pub(crate) mod harness { Ok(Self { conf, tenant_conf, - tenant_id, tenant_shard_id, - generation: Generation::new(0xdeadbeef), - shard: ShardIndex::unsharded(), + generation, + shard, remote_storage, remote_fs_dir, deletion_queue, }) } - pub async fn load(&self) -> (Arc, RequestContext) { + pub fn create(test_name: &'static str) -> anyhow::Result { + // Disable automatic GC and compaction to make the unit tests more deterministic. + // The tests perform them manually if needed. 
+ let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + ..TenantConf::default() + }; + let tenant_id = TenantId::generate(); + let shard = ShardIdentity::unsharded(); + Self::create_custom( + test_name, + tenant_conf, + tenant_id, + shard, + Generation::new(0xdeadbeef), + ) + } + + pub fn span(&self) -> tracing::Span { + info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } + + pub(crate) async fn load(&self) -> (Arc, RequestContext) { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); ( - self.try_load(&ctx) + self.do_try_load(&ctx) .await .expect("failed to load test tenant"), ctx, ) } - fn remote_empty(&self) -> bool { - let tenant_path = self.conf.tenant_path(&self.tenant_shard_id); - let remote_tenant_dir = self - .remote_fs_dir - .join(tenant_path.strip_prefix(&self.conf.workdir).unwrap()); - if std::fs::metadata(&remote_tenant_dir).is_err() { - return true; - } - - match std::fs::read_dir(remote_tenant_dir) - .unwrap() - .flatten() - .next() - { - Some(entry) => { - tracing::debug!( - "remote_empty: not empty, found file {}", - entry.file_name().to_string_lossy(), - ); - false - } - None => true, - } - } - - async fn do_try_load( + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) async fn do_try_load( &self, ctx: &RequestContext, - mode: LoadMode, ) -> anyhow::Result> { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); @@ -4064,36 +3968,23 @@ pub(crate) mod harness { TenantState::Loading, self.conf, AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt::from(self.tenant_conf), + TenantConfOpt::from(self.tenant_conf.clone()), self.generation, + &ShardParameters::default(), )) .unwrap(), // This is a legacy/test code path: sharding isn't supported here. 
ShardIdentity::unsharded(), - walredo_mgr, + Some(walredo_mgr), self.tenant_shard_id, - Some(self.remote_storage.clone()), + self.remote_storage.clone(), self.deletion_queue.new_client(), )); - match mode { - LoadMode::Local => { - tenant - .load_local(ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) - .await?; - } - LoadMode::Remote => { - let preload = tenant - .preload(&self.remote_storage, CancellationToken::new()) - .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) - .await?; - tenant - .attach(Some(preload), ctx) - .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())) - .await?; - } - } + let preload = tenant + .preload(&self.remote_storage, CancellationToken::new()) + .await?; + tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { @@ -4102,27 +3993,6 @@ pub(crate) mod harness { Ok(tenant) } - /// For tests that specifically want to exercise the local load path, which does - /// not use remote storage. - pub async fn try_load_local(&self, ctx: &RequestContext) -> anyhow::Result> { - self.do_try_load(ctx, LoadMode::Local).await - } - - /// The 'load' in this function is either a local load or a normal attachment, - pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { - // If we have nothing in remote storage, must use load_local instead of attach: attach - // will error out if there are no timelines. - // - // See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate - // this weird state of a Tenant which exists but doesn't have any timelines. 
- let mode = match self.remote_empty() { - true => LoadMode::Local, - false => LoadMode::Remote, - }; - - self.do_try_load(ctx, mode).await - } - pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { self.conf.timeline_path(&self.tenant_shard_id, timeline_id) } @@ -4143,37 +4013,61 @@ pub(crate) mod harness { records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, ) -> anyhow::Result { - let s = format!( - "redo for {} to get to {}, with {} and {} records", - key, - lsn, - if base_img.is_some() { - "base image" - } else { - "no base image" - }, - records.len() - ); - println!("{s}"); + let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); + if records_neon { + // For Neon wal records, we can decode without spawning postgres, so do so. + let base_img = base_img.expect("Neon WAL redo requires base image").1; + let mut page = BytesMut::new(); + page.extend_from_slice(&base_img); + for (record_lsn, record) in records { + apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?; + } + Ok(page.freeze()) + } else { + // We never spawn a postgres walredo process in unit tests: just log what we might have done. 
+ let s = format!( + "redo for {} to get to {}, with {} and {} records", + key, + lsn, + if base_img.is_some() { + "base image" + } else { + "no base image" + }, + records.len() + ); + println!("{s}"); - Ok(TEST_IMG(&s)) + Ok(test_img(&s)) + } } } } #[cfg(test)] mod tests { + use std::collections::BTreeMap; + use super::*; use crate::keyspace::KeySpaceAccum; + use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::repository::{Key, Value}; use crate::tenant::harness::*; + use crate::tenant::timeline::CompactFlags; + use crate::walrecord::NeonWalRecord; use crate::DEFAULT_PG_VERSION; - use crate::METADATA_FILE_NAME; - use bytes::BytesMut; + use bytes::{Bytes, BytesMut}; use hex_literal::hex; - use once_cell::sync::Lazy; + use itertools::Itertools; + use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::keyspace::KeySpace; + use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; - use tokio_util::sync::CancellationToken; + use storage_layer::PersistentLayerKey; + use tests::storage_layer::ValuesReconstructState; + use tests::timeline::{GetVectoredError, ShutdownMode}; + use utils::bin_ser::BeSer; + use utils::id::TenantId; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4185,24 +4079,24 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x10), - &Value::Image(TEST_IMG("foo at 0x10")), + &Value::Image(test_img("foo at 0x10")), &ctx, ) .await?; writer.finish_write(Lsn(0x10)); drop(writer); - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x20), - &Value::Image(TEST_IMG("foo at 0x20")), + &Value::Image(test_img("foo at 0x20")), &ctx, ) .await?; @@ -4211,15 +4105,15 @@ mod tests { assert_eq!( 
tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, - TEST_IMG("foo at 0x20") + test_img("foo at 0x20") ); Ok(()) @@ -4263,7 +4157,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap(); @@ -4297,7 +4191,7 @@ mod tests { let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); - let new_writer = newtline.writer().await; + let mut new_writer = newtline.writer().await; new_writer .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx) .await?; @@ -4328,15 +4222,14 @@ mod tests { ctx: &RequestContext, ) -> anyhow::Result<()> { let mut lsn = start_lsn; - #[allow(non_snake_case)] { - let writer = tline.writer().await; + let mut writer = tline.writer().await; // Create a relation on the timeline writer .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4346,7 +4239,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4355,12 +4248,12 @@ mod tests { } tline.freeze_and_flush().await?; { - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; @@ -4370,13 +4263,13 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + 
&Value::Image(test_img(&format!("foo at {}", lsn))), ctx, ) .await?; writer.finish_write(lsn); } - tline.freeze_and_flush().await + tline.freeze_and_flush().await.map_err(|e| e.into()) } #[tokio::test] @@ -4525,14 +4418,15 @@ mod tests { // Broken, as long as you don't need to access data from the parent. assert_eq!( newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?, - TEST_IMG(&format!("foo at {}", Lsn(0x70))) + test_img(&format!("foo at {}", Lsn(0x70))) ); // This needs to traverse to the parent, and fails. let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err(); - assert!(err - .to_string() - .contains("will not become active. Current state: Broken")); + assert!(err.to_string().starts_with(&format!( + "Bad state on timeline {}: Broken", + tline.timeline_id + ))); Ok(()) } @@ -4602,7 +4496,7 @@ mod tests { // Check that the data is still accessible on the branch. assert_eq!( newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, - TEST_IMG(&format!("foo at {}", Lsn(0x40))) + test_img(&format!("foo at {}", Lsn(0x40))) ); Ok(()) @@ -4620,8 +4514,8 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?; // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4661,8 +4555,8 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) + .instrument(harness.span()) .await .ok() .unwrap(); @@ -4711,60 +4605,6 @@ mod tests { Ok(()) } - #[tokio::test] - async fn corrupt_local_metadata() -> anyhow::Result<()> { - const TEST_NAME: &str = "corrupt_metadata"; - let harness = 
TenantHarness::create(TEST_NAME)?; - let (tenant, ctx) = harness.load().await; - - let tline = tenant - .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) - .await?; - drop(tline); - // so that all uploads finish & we can call harness.try_load() below again - tenant - .shutdown(Default::default(), true) - .instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id)) - .await - .ok() - .unwrap(); - drop(tenant); - - // Corrupt local metadata - let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); - assert!(metadata_path.is_file()); - let mut metadata_bytes = std::fs::read(&metadata_path)?; - assert_eq!(metadata_bytes.len(), 512); - metadata_bytes[8] ^= 1; - std::fs::write(metadata_path, metadata_bytes)?; - - let err = harness.try_load_local(&ctx).await.expect_err("should fail"); - // get all the stack with all .context, not only the last one - let message = format!("{err:#}"); - let expected = "failed to load metadata"; - assert!( - message.contains(expected), - "message '{message}' expected to contain {expected}" - ); - - let mut found_error_message = false; - let mut err_source = err.source(); - while let Some(source) = err_source { - if source.to_string().contains("metadata checksum mismatch") { - found_error_message = true; - break; - } - err_source = source.source(); - } - assert!( - found_error_message, - "didn't find the corrupted metadata error in {}", - message - ); - - Ok(()) - } - #[tokio::test] async fn test_images() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; @@ -4772,12 +4612,12 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x10), - &Value::Image(TEST_IMG("foo at 0x10")), + &Value::Image(test_img("foo at 0x10")), &ctx, ) .await?; @@ -4789,12 +4629,12 @@ mod tests { 
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x20), - &Value::Image(TEST_IMG("foo at 0x20")), + &Value::Image(test_img("foo at 0x20")), &ctx, ) .await?; @@ -4806,12 +4646,12 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x30), - &Value::Image(TEST_IMG("foo at 0x30")), + &Value::Image(test_img("foo at 0x30")), &ctx, ) .await?; @@ -4823,12 +4663,12 @@ mod tests { .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) .await?; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( *TEST_KEY, Lsn(0x40), - &Value::Image(TEST_IMG("foo at 0x40")), + &Value::Image(test_img("foo at 0x40")), &ctx, ) .await?; @@ -4842,28 +4682,96 @@ mod tests { assert_eq!( tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, - TEST_IMG("foo at 0x10") + test_img("foo at 0x10") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, - TEST_IMG("foo at 0x20") + test_img("foo at 0x20") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?, - TEST_IMG("foo at 0x30") + test_img("foo at 0x30") ); assert_eq!( tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?, - TEST_IMG("foo at 0x40") + test_img("foo at 0x40") ); Ok(()) } + async fn bulk_insert_compact_gc( + tenant: &Tenant, + timeline: &Arc, + ctx: &RequestContext, + lsn: Lsn, + repeat: usize, + key_count: usize, + ) -> anyhow::Result<()> { + let compact = true; + bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await + } + + async fn bulk_insert_maybe_compact_gc( + tenant: &Tenant, + timeline: &Arc, + ctx: &RequestContext, + mut lsn: Lsn, + repeat: usize, + key_count: usize, + compact: bool, + 
) -> anyhow::Result<()> { + let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let mut blknum = 0; + + // Enforce that key range is monotonously increasing + let mut keyspace = KeySpaceAccum::new(); + + let cancel = CancellationToken::new(); + + for _ in 0..repeat { + for _ in 0..key_count { + test_key.field6 = blknum; + let mut writer = timeline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + + keyspace.add_key(test_key); + + lsn = Lsn(lsn.0 + 0x10); + blknum += 1; + } + + timeline.freeze_and_flush().await?; + if compact { + // this requires timeline to be &Arc + timeline.compact(&cancel, EnumSet::empty(), ctx).await?; + } + + // this doesn't really need to use the timeline_id target, but it is closer to what it + // originally was. + let res = tenant + .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) + .await?; + + assert_eq!(res.layers_removed, 0, "this never removes anything"); + } + + Ok(()) + } + // // Insert 1000 key-value pairs with increasing keys, flush, compact, GC. // Repeat 50 times. @@ -4876,49 +4784,466 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) .await?; - let mut lsn = Lsn(0x10); + let lsn = Lsn(0x10); + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; - let mut keyspace = KeySpaceAccum::new(); + Ok(()) + } - let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); - let mut blknum = 0; - for _ in 0..50 { - for _ in 0..10000 { - test_key.field6 = blknum; - let writer = tline.writer().await; - writer - .put( - test_key, - lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), - &ctx, - ) - .await?; - writer.finish_write(lsn); - drop(writer); + // Test the vectored get real implementation against a simple sequential implementation. 
+ // + // The test generates a keyspace by repeatedly flushing the in-memory layer and compacting. + // Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys + // grow to the right on the X axis. + // [Delta] + // [Delta] + // [Delta] + // [Delta] + // ------------ Image --------------- + // + // After layer generation we pick the ranges to query as follows: + // 1. The beginning of each delta layer + // 2. At the seam between two adjacent delta layers + // + // There's one major downside to this test: delta layers only contains images, + // so the search can stop at the first delta layer and doesn't traverse any deeper. + #[tokio::test] + async fn test_get_vectored() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; - keyspace.add_key(test_key); + let lsn = Lsn(0x10); + bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?; - lsn = Lsn(lsn.0 + 0x10); - blknum += 1; + let guard = tline.layers.read().await; + guard.layer_map().dump(true, &ctx).await?; + + let mut reads = Vec::new(); + let mut prev = None; + guard.layer_map().iter_historic_layers().for_each(|desc| { + if !desc.is_delta() { + prev = Some(desc.clone()); + return; } - let cutoff = tline.get_last_record_lsn(); + let start = desc.key_range.start; + let end = desc + .key_range + .start + .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap()); + reads.push(KeySpace { + ranges: vec![start..end], + }); + if let Some(prev) = &prev { + if !prev.is_delta() { + return; + } + + let first_range = Key { + field6: prev.key_range.end.field6 - 4, + ..prev.key_range.end + }..prev.key_range.end; + + let second_range = desc.key_range.start..Key { + field6: desc.key_range.start.field6 + 4, + ..desc.key_range.start + }; + + reads.push(KeySpace { + ranges: vec![first_range, second_range], + 
}); + }; + + prev = Some(desc.clone()); + }); + + drop(guard); + + // Pick a big LSN such that we query over all the changes. + let reads_lsn = Lsn(u64::MAX - 1); + + for read in reads { + info!("Doing vectored read on {:?}", read); + + let vectored_res = tline + .get_vectored_impl( + read.clone(), + reads_lsn, + &mut ValuesReconstructState::new(), + &ctx, + ) + .await; tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), + .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx) + .await; + } + + Ok(()) + } + + #[tokio::test] + async fn test_get_vectored_aux_files() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_aux_files")?; + + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) + .await?; + let tline = tline.raw_timeline().unwrap(); + + let mut modification = tline.begin_modification(Lsn(0x1000)); + modification.put_file("foo/bar1", b"content1", &ctx).await?; + modification.set_lsn(Lsn(0x1008))?; + modification.put_file("foo/bar2", b"content2", &ctx).await?; + modification.commit(&ctx).await?; + + let child_timeline_id = TimelineId::generate(); + tenant + .branch_timeline_test( + tline, + child_timeline_id, + Some(tline.get_last_record_lsn()), + &ctx, + ) + .await?; + + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + let aux_keyspace = KeySpace { + ranges: vec![NON_INHERITED_RANGE], + }; + let read_lsn = child_timeline.get_last_record_lsn(); + + let vectored_res = child_timeline + .get_vectored_impl( + aux_keyspace.clone(), + read_lsn, + &mut ValuesReconstructState::new(), + &ctx, + ) + .await; + + child_timeline + .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx) + .await; + + let images = vectored_res?; + assert!(images.is_empty()); + Ok(()) + } + + // Test that vectored get handles layer gaps 
correctly + // by advancing into the next ancestor timeline if required. + // + // The test generates timelines that look like the diagram below. + // We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram). + // The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram). + // + // ``` + //-------------------------------+ + // ... | + // [ L1 ] | + // [ / L1 ] | Child Timeline + // ... | + // ------------------------------+ + // [ X L1 ] | Parent Timeline + // ------------------------------+ + // ``` + #[tokio::test] + async fn test_get_vectored_key_gap() -> anyhow::Result<()> { + let tenant_conf = TenantConf { + // Make compaction deterministic + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + // Encourage creation of L1 layers + checkpoint_distance: 16 * 1024, + compaction_target_size: 8 * 1024, + ..TenantConf::default() + }; + + let harness = TenantHarness::create_custom( + "test_get_vectored_key_gap", + tenant_conf, + TenantId::generate(), + ShardIdentity::unsharded(), + Generation::new(0xdeadbeef), + )?; + let (tenant, ctx) = harness.load().await; + + let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let gap_at_key = current_key.add(100); + let mut current_lsn = Lsn(0x10); + + const KEY_COUNT: usize = 10_000; + + let timeline_id = TimelineId::generate(); + let current_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + let mut writer = current_timeline.writer().await; + writer + .put( + gap_at_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + drop(writer); + + let mut latest_lsns = HashMap::new(); + latest_lsns.insert(gap_at_key, current_lsn); + + current_timeline.freeze_and_flush().await?; + + let child_timeline_id = TimelineId::generate(); + + tenant + 
.branch_timeline_test( + ¤t_timeline, + child_timeline_id, + Some(current_lsn), + &ctx, + ) + .await?; + let child_timeline = tenant + .get_timeline(child_timeline_id, true) + .expect("Should have the branched timeline"); + + for i in 0..KEY_COUNT { + if current_key == gap_at_key { + current_key = current_key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + current_key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))), &ctx, ) .await?; - tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + writer.finish_write(current_lsn); + drop(writer); + + latest_lsns.insert(current_key, current_lsn); + current_key = current_key.next(); + + // Flush every now and then to encourage layer file creation. + if i % 500 == 0 { + child_timeline.freeze_and_flush().await?; + } + } + + child_timeline.freeze_and_flush().await?; + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceRepartition); + child_timeline + .compact(&CancellationToken::new(), flags, &ctx) + .await?; + + let key_near_end = { + let mut tmp = current_key; + tmp.field6 -= 10; + tmp + }; + + let key_near_gap = { + let mut tmp = gap_at_key; + tmp.field6 -= 10; + tmp + }; + + let read = KeySpace { + ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], + }; + let results = child_timeline + .get_vectored_impl( + read.clone(), + current_lsn, + &mut ValuesReconstructState::new(), + &ctx, + ) + .await?; + + for (key, img_res) in results { + let expected = test_img(&format!("{} at {}", key, latest_lsns[&key])); + assert_eq!(img_res?, expected); + } + + Ok(()) + } + + // Test that vectored get descends into ancestor timelines correctly and + // does not return an image that's newer than requested. + // + // The diagram below ilustrates an interesting case. We have a parent timeline + // (top of the Lsn range) and a child timeline. 
The request key cannot be reconstructed + // from the child timeline, so the parent timeline must be visited. When advacing into + // the child timeline, the read path needs to remember what the requested Lsn was in + // order to avoid returning an image that's too new. The test below constructs such + // a timeline setup and does a few queries around the Lsn of each page image. + // ``` + // LSN + // ^ + // | + // | + // 500 | --------------------------------------> branch point + // 400 | X + // 300 | X + // 200 | --------------------------------------> requested lsn + // 100 | X + // |---------------------------------------> Key + // | + // ------> requested key + // + // Legend: + // * X - page images + // ``` + #[tokio::test] + async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?; + let (tenant, ctx) = harness.load().await; + + let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let end_key = start_key.add(1000); + let child_gap_at_key = start_key.add(500); + let mut parent_gap_lsns: BTreeMap = BTreeMap::new(); + + let mut current_lsn = Lsn(0x10); + + let timeline_id = TimelineId::generate(); + let parent_timeline = tenant + .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx) + .await?; + + current_lsn += 0x100; + + for _ in 0..3 { + let mut key = start_key; + while key < end_key { + current_lsn += 0x10; + + let image_value = format!("{} at {}", child_gap_at_key, current_lsn); + + let mut writer = parent_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&image_value)), + &ctx, + ) + .await?; + writer.finish_write(current_lsn); + + if key == child_gap_at_key { + parent_gap_lsns.insert(current_lsn, image_value); + } + + key = key.next(); + } + + parent_timeline.freeze_and_flush().await?; + } + + let child_timeline_id = TimelineId::generate(); + + let child_timeline = tenant + 
.branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx) + .await?; + + let mut key = start_key; + while key < end_key { + if key == child_gap_at_key { + key = key.next(); + continue; + } + + current_lsn += 0x10; + + let mut writer = child_timeline.writer().await; + writer + .put( + key, + current_lsn, + &Value::Image(test_img(&format!("{} at {}", key, current_lsn))), + &ctx, + ) .await?; - tline.gc().await?; + writer.finish_write(current_lsn); + + key = key.next(); + } + + child_timeline.freeze_and_flush().await?; + + let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10]; + let mut query_lsns = Vec::new(); + for image_lsn in parent_gap_lsns.keys().rev() { + for offset in lsn_offsets { + query_lsns.push(Lsn(image_lsn + .0 + .checked_add_signed(offset) + .expect("Shouldn't overflow"))); + } + } + + for query_lsn in query_lsns { + let results = child_timeline + .get_vectored_impl( + KeySpace { + ranges: vec![child_gap_at_key..child_gap_at_key.next()], + }, + query_lsn, + &mut ValuesReconstructState::new(), + &ctx, + ) + .await; + + let expected_item = parent_gap_lsns + .iter() + .rev() + .find(|(lsn, _)| **lsn <= query_lsn); + + info!( + "Doing vectored read at LSN {}. Expecting image to be: {:?}", + query_lsn, expected_item + ); + + match expected_item { + Some((_, img_value)) => { + let key_results = results.expect("No vectored get error expected"); + let key_result = &key_results[&child_gap_at_key]; + let returned_img = key_result + .as_ref() + .expect("No page reconstruct error expected"); + + info!( + "Vectored read at LSN {} returned image {}", + query_lsn, + std::str::from_utf8(returned_img)? 
+ ); + assert_eq!(*returned_img, test_img(img_value)); + } + None => { + assert!(matches!(results, Err(GetVectoredError::MissingKey(_)))); + } + } } Ok(()) @@ -4926,15 +5251,36 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_random_updates")?; + let names_algorithms = [ + ("test_random_updates_legacy", CompactionAlgorithm::Legacy), + ("test_random_updates_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_random_updates_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_random_updates_algorithm( + name: &'static str, + compaction_algorithm: CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; let (tenant, ctx) = harness.load().await; let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; const NUM_KEYS: usize = 1000; + let cancel = CancellationToken::new(); let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let mut test_key_end = test_key; + test_key_end.field6 = NUM_KEYS as u32; + tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end)); let mut keyspace = KeySpaceAccum::new(); @@ -4947,12 +5293,12 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -4968,12 +5314,12 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( 
test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -4987,26 +5333,15 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{} at {}", blknum, last_lsn)) ); } - // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; + // Perform a cycle of flush, and GC tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -5027,6 +5362,8 @@ mod tests { let mut keyspace = KeySpaceAccum::new(); + let cancel = CancellationToken::new(); + // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
let mut updated = [Lsn(0); NUM_KEYS]; @@ -5036,12 +5373,12 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5065,12 +5402,12 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), &ctx, ) .await?; @@ -5085,26 +5422,16 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{} at {}", blknum, last_lsn)) ); } // Perform a cycle of flush, compact, and GC - let cutoff = tline.get_last_record_lsn(); - tline - .update_gc_info( - Vec::new(), - cutoff, - Duration::ZERO, - &CancellationToken::new(), - &ctx, - ) - .await?; tline.freeze_and_flush().await?; - tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tenant + .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) .await?; - tline.gc().await?; } Ok(()) @@ -5142,12 +5469,12 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer().await; + let mut writer = tline.writer().await; writer .put( test_key, lsn, - &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + &Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))), &ctx, ) .await?; @@ -5169,7 +5496,7 @@ mod tests { test_key.field6 = blknum as u32; 
assert_eq!( tline.get(test_key, *lsn, &ctx).await?, - TEST_IMG(&format!("{idx} {blknum} at {lsn}")) + test_img(&format!("{idx} {blknum} at {lsn}")) ); } } @@ -5245,19 +5572,19 @@ mod tests { } #[tokio::test] - async fn test_uninit_mark_crash() -> anyhow::Result<()> { - let name = "test_uninit_mark_crash"; + async fn test_create_guard_crash() -> anyhow::Result<()> { + let name = "test_create_guard_crash"; let harness = TenantHarness::create(name)?; { let (tenant, ctx) = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; - // Keeps uninit mark in place + // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline - .shutdown() - .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id)) + .shutdown(super::timeline::ShutdownMode::Hard) + .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); } @@ -5269,7 +5596,7 @@ mod tests { assert_eq!( e, GetTimelineError::NotFound { - tenant_id: tenant.tenant_shard_id.tenant_id, + tenant_id: tenant.tenant_shard_id, timeline_id: TIMELINE_ID, } ) @@ -5281,10 +5608,1434 @@ mod tests { .timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID) .exists()); - assert!(!harness - .conf - .timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID) - .exists()); + Ok(()) + } + + #[tokio::test] + async fn test_read_at_max_lsn() -> anyhow::Result<()> { + let names_algorithms = [ + ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy), + ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered), + ]; + for (name, algorithm) in names_algorithms { + test_read_at_max_lsn_algorithm(name, algorithm).await?; + } + Ok(()) + } + + async fn test_read_at_max_lsn_algorithm( + name: &'static str, + compaction_algorithm: 
CompactionAlgorithm, + ) -> anyhow::Result<()> { + let mut harness = TenantHarness::create(name)?; + harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings { + kind: compaction_algorithm, + }; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx) + .await?; + + let lsn = Lsn(0x10); + let compact = false; + bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?; + + let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + let read_lsn = Lsn(u64::MAX - 1); + + let result = tline.get(test_key, read_lsn, &ctx).await; + assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err()); + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_scan() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_scan")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 10000; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let mut test_key = base_key; + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. 
+ let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0x10); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + updated[blknum] = lsn; + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for iter in 0..=10 { + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = (blknum * STEP) as u32; + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", blknum, last_lsn)) + ); + } + + let mut cnt = 0; + for (key, value) in tline + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::default(), + &ctx, + ) + .await? + { + let blknum = key.field6 as usize; + let value = value?; + assert!(blknum % STEP == 0); + let blknum = blknum / STEP; + assert_eq!( + value, + test_img(&format!("{} at {}", blknum, updated[blknum])) + ); + cnt += 1; + } + + assert_eq!(cnt, NUM_KEYS); + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Perform two cycles of flush, compact, and GC + for round in 0..2 { + tline.freeze_and_flush().await?; + tline + .compact( + &cancel, + if iter % 5 == 0 && round == 0 { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + } else { + EnumSet::empty() + }, + &ctx, + ) + .await?; + tenant + 
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .await?; + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_compaction_trigger() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_compaction_trigger")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + let cancel = CancellationToken::new(); + + let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + base_key.field1 = AUX_KEY_PREFIX; + let test_key = base_key; + let mut lsn = Lsn(0x10); + + for _ in 0..20 { + lsn = Lsn(lsn.0 + 0x10); + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", 0, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + tline.freeze_and_flush().await?; // force create a delta layer + } + + let before_num_l0_delta_files = tline + .layers + .read() + .await + .layer_map() + .get_level0_deltas()? + .len(); + + tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + + let after_num_l0_delta_files = tline + .layers + .read() + .await + .layer_map() + .get_level0_deltas()? 
+ .len(); + + assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}"); + + assert_eq!( + tline.get(test_key, lsn, &ctx).await?, + test_img(&format!("{} at {}", 0, lsn)) + ); + + Ok(()) + } + + #[tokio::test] + async fn test_branch_copies_dirty_aux_file_flag() { + let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap(); + + // the default aux file policy to switch is v1 if not set by the admins + assert_eq!( + harness.tenant_conf.switch_aux_file_policy, + AuxFilePolicy::V1 + ); + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // no aux file is written at this point, so the persistent flag should be unset + assert_eq!(tline.last_aux_file_policy.load(), None); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration through, so lets mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "aux file is written with switch_aux_file_policy unset (which is v1), so we should keep v1" + ); + + // we can read everything from the storage + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + 
files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "keep v1 storage format when new files are written" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + let child = tenant + .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx) + .await + .unwrap(); + + // child copies the last flag even if that is not on remote storage yet + assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); + assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V1)); + + let files = child.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + assert_eq!(files.get("pg_logical/mappings/test2"), None); + + // even if we crash here without flushing parent timeline with it's new + // last_aux_file_policy we are safe, because child was never meant to access ancestor's + // files. the ancestor can even switch back to V1 because of a migration safely. 
+ } + + #[tokio::test] + async fn aux_file_policy_switch() { + let mut harness = TenantHarness::create("aux_file_policy_switch").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + // there is no tenant manager to pass the configuration through, so lets mimic it + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V2, + "wanted state has been updated" + ); + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::CrossValidation), + "dirty index_part.json reflected state is yet to be updated" + ); + + // we can still read the auxfile v1 before we ingest anything new + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the 
wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")), + "cross validation writes to both v1 and v2 so this should be available in v2" + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + + // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V1), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"third", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.get_switch_aux_file_policy(), + AuxFilePolicy::V1, + "wanted state has been updated again, even if invalid request" + ); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "ingesting a file should apply the wanted switch state when applicable" + ); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"third")) + ); + + // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file) + tenant.set_new_location_config( + AttachedTenantConf::try_from(LocationConf::attached_single( + TenantConfOpt { + switch_aux_file_policy: Some(AuxFilePolicy::V2), + ..Default::default() + }, + tenant.generation, + &pageserver_api::models::ShardParameters::default(), + )) + .unwrap(), + ); + + { + lsn += 8; + let 
mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test3", b"last", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2); + + assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); + + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"third")) + ); + assert_eq!( + files.get("pg_logical/mappings/test3"), + Some(&bytes::Bytes::from_static(b"last")) + ); + } + + #[tokio::test] + async fn aux_file_policy_force_switch() { + let mut harness = TenantHarness::create("aux_file_policy_force_switch").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V2), + "dirty index_part.json reflected state is yet to be updated" + ); + + // lose all data from v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!(files.get("pg_logical/mappings/test1"), None); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test2", b"second", &ctx) + .await + .unwrap(); + 
modification.commit(&ctx).await.unwrap(); + } + + // read data ingested in v2 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test2"), + Some(&bytes::Bytes::from_static(b"second")) + ); + // lose all data from v1 + assert_eq!(files.get("pg_logical/mappings/test1"), None); + } + + #[tokio::test] + async fn aux_file_policy_auto_detect() { + let mut harness = TenantHarness::create("aux_file_policy_auto_detect").unwrap(); + harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode + let (tenant, ctx) = harness.load().await; + + let mut lsn = Lsn(0x08); + + let tline: Arc = tenant + .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + assert_eq!( + tline.last_aux_file_policy.load(), + None, + "no aux file is written so it should be unset" + ); + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { + files: vec![( + "test_file".to_string(), + Bytes::copy_from_slice(b"test_file"), + )] + .into_iter() + .collect(), + }) + .unwrap(); + modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + modification.commit(&ctx).await.unwrap(); + } + + { + lsn += 8; + let mut modification = tline.begin_modification(lsn); + modification + .put_file("pg_logical/mappings/test1", b"first", &ctx) + .await + .unwrap(); + modification.commit(&ctx).await.unwrap(); + } + + assert_eq!( + tline.last_aux_file_policy.load(), + Some(AuxFilePolicy::V1), + "keep using v1 because there are aux files writting with v1" + ); + + // we can still read the auxfile v1 + let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); + assert_eq!( + files.get("pg_logical/mappings/test1"), + Some(&bytes::Bytes::from_static(b"first")) + ); + assert_eq!( + files.get("test_file"), + Some(&bytes::Bytes::from_static(b"test_file")) + ); + } + + #[tokio::test] + async fn 
test_metadata_image_creation() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_image_creation")?; + let (tenant, ctx) = harness.load().await; + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await?; + + const NUM_KEYS: usize = 1000; + const STEP: usize = 10000; // random update + scan base_key + idx * STEP + + let cancel = CancellationToken::new(); + + let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... + let mut test_key = base_key; + let mut lsn = Lsn(0x10); + + async fn scan_with_statistics( + tline: &Timeline, + keyspace: &KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result<(BTreeMap>, usize)> { + let mut reconstruct_state = ValuesReconstructState::default(); + let res = tline + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) + } + + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32)); + + for iter in 1..=10 { + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = (blknum * STEP) as u32; + let mut writer = tline.writer().await; + writer + .put( + test_key, + lsn, + &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &ctx, + ) + .await?; + writer.finish_write(lsn); + drop(writer); + } + + tline.freeze_and_flush().await?; + + if iter % 5 == 0 { + let (_, before_delta_file_accessed) = + 
scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await?; + let (_, after_delta_file_accessed) = + scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?; + assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}"); + // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances. + assert!( + after_delta_file_accessed <= 2, + "after_delta_file_accessed={after_delta_file_accessed}" + ); + } + } + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let (tenant, ctx) = harness.load().await; + + let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap(); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN + ) + .await?; + tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next()))); + + let child = tenant + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x30), vec![(base_key_child, 
test_img("data key 2"))])], // image layers + Lsn(0x30), + ) + .await + .unwrap(); + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + let lsn = Lsn(0x30); + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error()); + assert!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + Some(test_img("data key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("data key 2")) + ); + assert!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx) + .await + .unwrap_err() + .is_missing_key_error() + ); + + Ok(()) + } + + #[tokio::test] + async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let (tenant, ctx) = harness.load().await; + + let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap(); + assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix... 
+ + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), // delta layers + vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers + Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN + ) + .await?; + + let child = tenant + .branch_timeline_test_with_layers( + &tline, + NEW_TIMELINE_ID, + Some(Lsn(0x20)), + &ctx, + Vec::new(), // delta layers + vec![( + Lsn(0x30), + vec![(base_key_child, test_img("metadata key 2"))], + )], // image layers + Lsn(0x30), + ) + .await + .unwrap(); + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + let lsn = Lsn(0x30); + + // test vectored get on parent timeline + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?, + Some(test_img("metadata key 1")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?, + None + ); + + // test vectored get on child timeline + assert_eq!( + get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?, + None + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?, + Some(test_img("metadata key 2")) + ); + assert_eq!( + get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?, + None + ); + + Ok(()) + } + + async fn get_vectored_impl_wrapper( + tline: &Arc, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result, GetVectoredError> { + let mut 
reconstruct_state = ValuesReconstructState::new(); + let mut res = tline + .get_vectored_impl( + KeySpace::single(key..key.next()), + lsn, + &mut reconstruct_state, + ctx, + ) + .await?; + Ok(res.pop_last().map(|(k, v)| { + assert_eq!(k, key); + v.unwrap() + })) + } + + #[tokio::test] + async fn test_metadata_tombstone_reads() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_metadata_tombstone_reads")?; + let (tenant, ctx) = harness.load().await; + let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + // We emulate the situation that the compaction algorithm creates an image layer that removes the tombstones + // Lsn 0x30 key0, key3, no key1+key2 + // Lsn 0x20 key1+key2 tomestones + // Lsn 0x10 key1 in image, key2 in delta + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ], + // image layers + vec![ + (Lsn(0x10), vec![(key1, test_img("metadata key 1"))]), + ( + Lsn(0x30), + vec![ + (key0, test_img("metadata key 0")), + (key3, test_img("metadata key 3")), + ], + ), + ], + Lsn(0x30), + ) + .await?; + + let lsn = Lsn(0x30); + let old_lsn = Lsn(0x20); + + assert_eq!( + get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?, + Some(test_img("metadata key 0")) + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?, + None, + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?, + None, + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?, + 
Some(Bytes::new()), + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?, + Some(Bytes::new()), + ); + assert_eq!( + get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?, + Some(test_img("metadata key 3")) + ); + + Ok(()) + } + + #[tokio::test] + async fn test_metadata_tombstone_image_creation() { + let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap(); + let (tenant, ctx) = harness.load().await; + + let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap(); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ], + // image layers + vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], + Lsn(0x30), + ) + .await + .unwrap(); + + let cancel = CancellationToken::new(); + + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await + .unwrap(); + + // Image layers are created at last_record_lsn + let images = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await + .unwrap() + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed. 
+ } + + #[tokio::test] + async fn test_metadata_tombstone_empty_image_creation() { + let harness = + TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap(); + let (tenant, ctx) = harness.load().await; + + let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); + let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap(); + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + // delta layers + vec![ + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ], + // image layers + vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], + Lsn(0x30), + ) + .await + .unwrap(); + + let cancel = CancellationToken::new(); + + tline + .compact( + &cancel, + { + let mut flags = EnumSet::new(); + flags.insert(CompactFlags::ForceImageLayerCreation); + flags.insert(CompactFlags::ForceRepartition); + flags + }, + &ctx, + ) + .await + .unwrap(); + + // Image layers are created at last_record_lsn + let images = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await + .unwrap() + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. 
+ // + // | D1 | | D3 | + // -| |-- gc horizon ----------------- + // | | | D2 | + // --------- img layer ------------------ + // + // What we should expact from this compaction is: + // | Part of D1 | | D3 | + // --------- img layer with D1+D2 at GC horizon------------------ + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), test_img(&format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose. + ( + get_key(1), + Lsn(0x20), + Value::Image(test_img("value 1@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::Image(test_img("value 2@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::Image(test_img("value 3@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::Image(test_img("value 5@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::Image(test_img("value 6@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x40), + Value::Image(test_img("value 8@0x40")), + ), + ( + get_key(9), + Lsn(0x40), + Value::Image(test_img("value 9@0x40")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![delta1, delta2, delta3], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.pitr = Lsn(0x30); + guard.cutoffs.horizon = Lsn(0x30); + } + + let cancel = CancellationToken::new(); + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + // Check if the image layer at the GC horizon contains exactly what we want + let image_at_gc_horizon = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await + .unwrap() + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + + assert_eq!(image_at_gc_horizon.len(), 10); + let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 
0x20, 0x20, 0x10, 0x10, 0x10]; + for idx in 0..10 { + assert_eq!( + image_at_gc_horizon[idx], + ( + get_key(idx as u32), + test_img(&format!("value {idx}@{:#x}", expected_lsn[idx])) + ) + ); + } + + // Check if old layers are removed / new layers have the expected LSN + let mut all_layers = tline.inspect_historic_layers().await.unwrap(); + all_layers.sort_by(|k1, k2| { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + }); + assert_eq!( + all_layers, + vec![ + // Image layer at GC horizon + PersistentLayerKey { + key_range: Key::MIN..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x31), + is_delta: false + }, + // The delta layer that is cut in the middle + PersistentLayerKey { + key_range: Key::MIN..get_key(9), + lsn_range: Lsn(0x30)..Lsn(0x41), + is_delta: true + }, + // The delta layer we created and should not be picked for the compaction + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x40)..Lsn(0x41), + is_delta: true + } + ] + ); + + Ok(()) + } + + #[tokio::test] + async fn test_neon_test_record() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_neon_test_record")?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(",0x20")), + ), + ( + get_key(1), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(",0x30")), + ), + (get_key(2), Lsn(0x10), Value::Image("0x10".into())), + ( + get_key(2), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append(",0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append(",0x30")), + ), + (get_key(3), Lsn(0x10), Value::Image("0x10".into())), + ( + get_key(3), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_clear()), + ), + (get_key(4), Lsn(0x10), Value::Image("0x10".into())), + ( + get_key(4), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_init()), + ), + ]; + let image1 = vec![(get_key(1), "0x10".into())]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![delta1], // delta layers + vec![(Lsn(0x10), image1)], // image layers + Lsn(0x50), + ) + .await?; + + assert_eq!( + tline.get(get_key(1), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"0x10,0x20,0x30") + ); + assert_eq!( + tline.get(get_key(2), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"0x10,0x20,0x30") + ); + // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new()); + // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new()); + + Ok(()) + } + + #[tokio::test] + async fn test_lsn_lease() -> anyhow::Result<()> { + let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await; + let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); + + let end_lsn = Lsn(0x100); + let image_layers = (0x20..=0x90) + .step_by(0x10) + .map(|n| { + ( + Lsn(n), + vec![(key, test_img(&format!("data key at {:x}", n)))], + ) + }) + .collect(); + + let timeline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + 
Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + Vec::new(), + image_layers, + end_lsn, + ) + .await?; + + let leased_lsns = [0x30, 0x50, 0x70]; + let mut leases = Vec::new(); + let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| { + leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?); + Ok(()) + }); + + // Renewing with shorter lease should not change the lease. + let updated_lease_0 = + timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?; + assert_eq!(updated_lease_0.valid_until, leases[0].valid_until); + + // Renewing with a long lease should renew lease with later expiration time. + let updated_lease_1 = timeline.make_lsn_lease( + Lsn(leased_lsns[1]), + timeline.get_lsn_lease_length() * 2, + &ctx, + )?; + + assert!(updated_lease_1.valid_until > leases[1].valid_until); + + // Force set disk consistent lsn so we can get the cutoff at `end_lsn`. + info!( + "latest_gc_cutoff_lsn: {}", + *timeline.get_latest_gc_cutoff_lsn() + ); + timeline.force_set_disk_consistent_lsn(end_lsn); + + let res = tenant + .gc_iteration( + Some(TIMELINE_ID), + 0, + Duration::ZERO, + &CancellationToken::new(), + &ctx, + ) + .await?; + + // Keeping everything <= Lsn(0x80) b/c leases: + // 0/10: initdb layer + // (0/20..=0/70).step_by(0x10): image layers added when creating the timeline. + assert_eq!(res.layers_needed_by_leases, 7); + // Keeping 0/90 b/c it is the latest layer. + assert_eq!(res.layers_not_updated, 1); + // Removed 0/80. + assert_eq!(res.layers_removed, 1); + + // Make lease on a already GC-ed LSN. + // 0/80 does not have a valid lease + is below latest_gc_cutoff + assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn()); + let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx); + assert!(res.is_err()); + + // Should still be able to renew a currently valid lease + // Assumption: original lease to is still valid for 0/50. 
+ let _ = + timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?; Ok(()) } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 6de2e95055..2be8816cef 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -11,6 +11,9 @@ //! len < 128: 0XXXXXXX //! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! +use bytes::{BufMut, BytesMut}; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; + use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; use crate::tenant::block_io::BlockCursor; @@ -100,6 +103,8 @@ pub struct BlobWriter { offset: u64, /// A buffer to save on write calls, only used if BUFFERED=true buf: Vec, + /// We do tiny writes for the length headers; they need to be in an owned buffer; + io_buf: Option, } impl BlobWriter { @@ -108,6 +113,7 @@ impl BlobWriter { inner, offset: start_offset, buf: Vec::with_capacity(Self::CAPACITY), + io_buf: Some(BytesMut::new()), } } @@ -115,23 +121,34 @@ impl BlobWriter { self.offset } - const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 }; + const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 }; - #[inline(always)] /// Writes the given buffer directly to the underlying `VirtualFile`. /// You need to make sure that the internal buffer is empty, otherwise /// data will be written in wrong order. 
- async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> { - self.inner.write_all(src_buf).await?; - self.offset += src_buf.len() as u64; - Ok(()) + #[inline(always)] + async fn write_all_unbuffered, Buf: IoBuf + Send>( + &mut self, + src_buf: B, + ctx: &RequestContext, + ) -> (B::Buf, Result<(), Error>) { + let (src_buf, res) = self.inner.write_all(src_buf, ctx).await; + let nbytes = match res { + Ok(nbytes) => nbytes, + Err(e) => return (src_buf, Err(e)), + }; + self.offset += nbytes as u64; + (src_buf, Ok(())) } #[inline(always)] /// Flushes the internal buffer to the underlying `VirtualFile`. - pub async fn flush_buffer(&mut self) -> Result<(), Error> { - self.inner.write_all(&self.buf).await?; - self.buf.clear(); + pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> { + let buf = std::mem::take(&mut self.buf); + let (mut buf, res) = self.inner.write_all(buf, ctx).await; + res?; + buf.clear(); + self.buf = buf; Ok(()) } @@ -146,62 +163,102 @@ impl BlobWriter { } /// Internal, possibly buffered, write function - async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + src_buf: B, + ctx: &RequestContext, + ) -> (B::Buf, Result<(), Error>) { if !BUFFERED { assert!(self.buf.is_empty()); - self.write_all_unbuffered(src_buf).await?; - return Ok(()); + return self.write_all_unbuffered(src_buf, ctx).await; } let remaining = Self::CAPACITY - self.buf.len(); + let src_buf_len = src_buf.bytes_init(); + if src_buf_len == 0 { + return (Slice::into_inner(src_buf.slice_full()), Ok(())); + } + let mut src_buf = src_buf.slice(0..src_buf_len); // First try to copy as much as we can into the buffer if remaining > 0 { - let copied = self.write_into_buffer(src_buf); - src_buf = &src_buf[copied..]; + let copied = self.write_into_buffer(&src_buf); + src_buf = src_buf.slice(copied..); } // Then, if the buffer is full, flush it out if self.buf.len() == 
Self::CAPACITY { - self.flush_buffer().await?; + if let Err(e) = self.flush_buffer(ctx).await { + return (Slice::into_inner(src_buf), Err(e)); + } } // Finally, write the tail of src_buf: // If it wholly fits into the buffer without // completely filling it, then put it there. // If not, write it out directly. - if !src_buf.is_empty() { + let src_buf = if !src_buf.is_empty() { assert_eq!(self.buf.len(), 0); if src_buf.len() < Self::CAPACITY { - let copied = self.write_into_buffer(src_buf); + let copied = self.write_into_buffer(&src_buf); // We just verified above that src_buf fits into our internal buffer. assert_eq!(copied, src_buf.len()); + Slice::into_inner(src_buf) } else { - self.write_all_unbuffered(src_buf).await?; + let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await; + if let Err(e) = res { + return (src_buf, Err(e)); + } + src_buf } - } - Ok(()) + } else { + Slice::into_inner(src_buf) + }; + (src_buf, Ok(())) } /// Write a blob of data. Returns the offset that it was written to, /// which can be used to retrieve the data later. - pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result { + pub async fn write_blob, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ctx: &RequestContext, + ) -> (B::Buf, Result) { let offset = self.offset; - if srcbuf.len() < 128 { - // Short blob. Write a 1-byte length header - let len_buf = srcbuf.len() as u8; - self.write_all(&[len_buf]).await?; - } else { - // Write a 4-byte length header - if srcbuf.len() > 0x7fff_ffff { - return Err(Error::new( - ErrorKind::Other, - format!("blob too large ({} bytes)", srcbuf.len()), - )); + let len = srcbuf.bytes_init(); + + let mut io_buf = self.io_buf.take().expect("we always put it back below"); + io_buf.clear(); + let (io_buf, hdr_res) = async { + if len < 128 { + // Short blob. 
Write a 1-byte length header + io_buf.put_u8(len as u8); + self.write_all(io_buf, ctx).await + } else { + // Write a 4-byte length header + if len > 0x7fff_ffff { + return ( + io_buf, + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({len} bytes)"), + )), + ); + } + if len > 0x0fff_ffff { + tracing::warn!("writing blob above future limit ({len} bytes)"); + } + let mut len_buf = (len as u32).to_be_bytes(); + len_buf[0] |= 0x80; + io_buf.extend_from_slice(&len_buf[..]); + self.write_all(io_buf, ctx).await } - let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes(); - len_buf[0] |= 0x80; - self.write_all(&len_buf).await?; } - self.write_all(srcbuf).await?; - Ok(offset) + .await; + self.io_buf = Some(io_buf); + match hdr_res { + Ok(_) => (), + Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), + } + let (srcbuf, res) = self.write_all(srcbuf, ctx).await; + (srcbuf, res.map(|_| offset)) } } @@ -210,8 +267,8 @@ impl BlobWriter { /// /// This function flushes the internal buffer before giving access /// to the underlying `VirtualFile`. 
- pub async fn into_inner(mut self) -> Result { - self.flush_buffer().await?; + pub async fn into_inner(mut self, ctx: &RequestContext) -> Result { + self.flush_buffer(ctx).await?; Ok(self.inner) } @@ -245,20 +302,22 @@ mod tests { // Write part (in block to drop the file) let mut offsets = Vec::new(); { - let file = VirtualFile::create(pathbuf.as_path()).await?; + let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let offs = wtr.write_blob(blob).await?; + let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; + let offs = res?; offsets.push(offs); } // Write out one page worth of zeros so that we can // read again with read_blk - let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?; + let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await; + let offs = res?; println!("Writing final blob at offs={offs}"); - wtr.flush_buffer().await?; + wtr.flush_buffer(&ctx).await?; } - let file = VirtualFile::open(pathbuf.as_path()).await?; + let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); let rdr = BlockCursor::new(rdr); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 0617017528..92928116c1 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,10 +5,10 @@ use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; use crate::virtual_file::VirtualFile; use bytes::Bytes; -use std::ops::{Deref, DerefMut}; +use std::ops::Deref; /// This is implemented by anything that can read 8 kB (PAGE_SZ) /// blocks, using the page cache @@ -39,6 
+39,8 @@ pub enum BlockLease<'a> { EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), + #[cfg(test)] + Vec(Vec), } impl From> for BlockLease<'static> { @@ -63,6 +65,10 @@ impl<'a> Deref for BlockLease<'a> { BlockLease::EphemeralFileMutableTail(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), + #[cfg(test)] + BlockLease::Vec(v) => { + TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ") + } } } } @@ -72,7 +78,7 @@ impl<'a> Deref for BlockLease<'a> { /// /// Unlike traits, we also support the read function to be async though. pub(crate) enum BlockReaderRef<'a> { - FileBlockReader(&'a FileBlockReader), + FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), #[cfg(test)] @@ -96,7 +102,7 @@ impl<'a> BlockReaderRef<'a> { #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] - VirtualFile(r) => r.read_blk(blknum).await, + VirtualFile(r) => r.read_blk(blknum, ctx).await, } } } @@ -154,25 +160,28 @@ impl<'a> BlockCursor<'a> { /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. -pub struct FileBlockReader { - pub file: VirtualFile, +pub struct FileBlockReader<'a> { + pub file: &'a VirtualFile, /// Unique ID of this file, used as key in the page cache. file_id: page_cache::FileId, } -impl FileBlockReader { - pub fn new(file: VirtualFile) -> Self { - let file_id = page_cache::next_file_id(); - +impl<'a> FileBlockReader<'a> { + pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { FileBlockReader { file_id, file } } /// Read a page from the underlying file into given buffer. 
- async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> { + async fn fill_buffer( + &self, + buf: PageWriteGuard<'static>, + blkno: u32, + ctx: &RequestContext, + ) -> Result, std::io::Error> { assert!(buf.len() == PAGE_SZ); self.file - .read_exact_at(buf, blkno as u64 * PAGE_SZ as u64) + .read_exact_at_page(buf, blkno as u64 * PAGE_SZ as u64, ctx) .await } /// Read a block. @@ -180,11 +189,11 @@ impl FileBlockReader { /// Returns a "lease" object that can be used to /// access to the contents of the page. (For the page cache, the /// lease object represents a lock on the buffer.) - pub async fn read_blk( + pub async fn read_blk<'b>( &self, blknum: u32, ctx: &RequestContext, - ) -> Result { + ) -> Result, std::io::Error> { let cache = page_cache::get(); match cache .read_immutable_buf(self.file_id, blknum, ctx) @@ -196,16 +205,16 @@ impl FileBlockReader { ) })? { ReadBufResult::Found(guard) => Ok(guard.into()), - ReadBufResult::NotFound(mut write_guard) => { + ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer - self.fill_buffer(write_guard.deref_mut(), blknum).await?; + let write_guard = self.fill_buffer(write_guard, blknum, ctx).await?; Ok(write_guard.mark_valid().into()) } } } } -impl BlockReader for FileBlockReader { +impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { BlockCursor::new(BlockReaderRef::FileBlockReader(self)) } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 2d4cd350d7..1b9be12642 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,7 +9,12 @@ //! may lead to a data loss. //! 
use anyhow::bail; -use pageserver_api::models; +use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::CompactionAlgorithm; +use pageserver_api::models::CompactionAlgorithmSettings; +use pageserver_api::models::EvictionPolicy; +use pageserver_api::models::LsnLease; +use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; @@ -19,6 +24,7 @@ use std::time::Duration; use utils::generation::Generation; pub mod defaults { + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB // would be more appropriate. But a low value forces the code to be exercised more, // which is good for now to trigger bugs. @@ -26,12 +32,17 @@ pub mod defaults { pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; + // FIXME the below configs are only used by legacy algorithm. The new algorithm + // has different parameters. + // Target file size, when creating image and delta layers. // This parameter determines L1 layer file size. 
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm = + super::CompactionAlgorithm::Legacy; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; @@ -44,13 +55,19 @@ pub mod defaults { pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; + // The default limit on WAL lag should be set to avoid causing disconnects under high throughput + // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for + // throughputs up to 1GiB/s per timeline. + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + // By default ingest enough WAL for two new L0 layers before checking if new image + // image layers should be created. + pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached /// pageserver. This is the "normal" attachment mode. 
@@ -65,7 +82,7 @@ pub(crate) enum AttachmentMode { Stale, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) struct AttachedLocationConfig { pub(crate) generation: Generation, pub(crate) attach_mode: AttachmentMode, @@ -167,14 +184,17 @@ impl LocationConf { /// For use when loading from a legacy configuration: presence of a tenant /// implies it is in AttachmentMode::Single, which used to be the only /// possible state. This function should eventually be removed. - pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self { + pub(crate) fn attached_single( + tenant_conf: TenantConfOpt, + generation: Generation, + shard_params: &models::ShardParameters, + ) -> Self { Self { mode: LocationMode::Attached(AttachedLocationConfig { generation, attach_mode: AttachmentMode::Single, }), - // Legacy configuration loads are always from tenants created before sharding existed. - shard: ShardIdentity::unsharded(), + shard: ShardIdentity::from_params(ShardNumber(0), shard_params), tenant_conf, } } @@ -182,16 +202,17 @@ impl LocationConf { /// For use when attaching/re-attaching: update the generation stored in this /// structure. If we were in a secondary state, promote to attached (posession /// of a fresh generation implies this). 
- pub(crate) fn attach_in_generation(&mut self, generation: Generation) { + pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) { match &mut self.mode { LocationMode::Attached(attach_conf) => { attach_conf.generation = generation; + attach_conf.attach_mode = mode; } LocationMode::Secondary(_) => { // We are promoted to attached by the control plane's re-attach response self.mode = LocationMode::Attached(AttachedLocationConfig { generation, - attach_mode: AttachmentMode::Single, + attach_mode: mode, }) } } @@ -247,7 +268,7 @@ impl LocationConf { } else { ShardIdentity::new( ShardNumber(conf.shard_number), - ShardCount(conf.shard_count), + ShardCount::new(conf.shard_count), ShardStripeSize(conf.shard_stripe_size), )? }; @@ -281,7 +302,7 @@ impl Default for LocationConf { /// /// For storing and transmitting individual tenant's configuration, see /// TenantConfOpt. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct TenantConf { // Flush out an inmemory layer, if it's holding WAL older than this // This puts a backstop on how much WAL needs to be re-digested if the @@ -301,6 +322,7 @@ pub struct TenantConf { pub compaction_period: Duration, // Level0 delta layer threshold for compaction. pub compaction_threshold: usize, + pub compaction_algorithm: CompactionAlgorithmSettings, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. @@ -335,17 +357,42 @@ pub struct TenantConf { // See the corresponding metric's help string. #[serde(with = "humantime_serde")] pub evictions_low_residence_duration_metric_threshold: Duration, - pub gc_feedback: bool, /// If non-zero, the period between uploads of a heatmap from attached tenants. 
This /// may be disabled if a Tenant will not have secondary locations: only secondary /// locations will use the heatmap uploaded by attached locations. + #[serde(with = "humantime_serde")] pub heatmap_period: Duration, + + /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup + pub lazy_slru_download: bool, + + pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, + + // How much WAL must be ingested before checking again whether a new image layer is required. + // Expresed in multiples of checkpoint distance. + pub image_layer_creation_check_threshold: u8, + + /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux + /// file is written. + pub switch_aux_file_policy: AuxFilePolicy, + + /// The length for an explicit LSN lease request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Duration, + + /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Duration, } /// Same as TenantConf, but this struct preserves the information about /// which parameters are set and which are not. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] @@ -369,6 +416,10 @@ pub struct TenantConfOpt { #[serde(default)] pub compaction_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub compaction_algorithm: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub gc_horizon: Option, @@ -419,37 +470,33 @@ pub struct TenantConfOpt { pub evictions_low_residence_duration_metric_threshold: Option, #[serde(skip_serializing_if = "Option::is_none")] + #[serde(with = "humantime_serde")] #[serde(default)] - pub gc_feedback: Option, + pub heatmap_period: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub lazy_slru_download: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub timeline_get_throttle: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub image_layer_creation_check_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub switch_aux_file_policy: Option, #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] - pub heatmap_period: Option, -} + pub lsn_lease_length: Option, -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(tag = "kind")] -pub enum EvictionPolicy { - NoEviction, - LayerAccessThreshold(EvictionPolicyLayerAccessThreshold), -} - -impl EvictionPolicy { - pub fn discriminant_str(&self) -> &'static str { - match self { - EvictionPolicy::NoEviction => "NoEviction", - EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold", - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub struct EvictionPolicyLayerAccessThreshold { + #[serde(skip_serializing_if = "Option::is_none")] 
#[serde(with = "humantime_serde")] - pub period: Duration, - #[serde(with = "humantime_serde")] - pub threshold: Duration, + #[serde(default)] + pub lsn_lease_length_for_ts: Option, } impl TenantConfOpt { @@ -470,6 +517,11 @@ impl TenantConfOpt { compaction_threshold: self .compaction_threshold .unwrap_or(global_conf.compaction_threshold), + compaction_algorithm: self + .compaction_algorithm + .as_ref() + .unwrap_or(&global_conf.compaction_algorithm) + .clone(), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self @@ -493,8 +545,26 @@ impl TenantConfOpt { evictions_low_residence_duration_metric_threshold: self .evictions_low_residence_duration_metric_threshold .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), - gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period), + lazy_slru_download: self + .lazy_slru_download + .unwrap_or(global_conf.lazy_slru_download), + timeline_get_throttle: self + .timeline_get_throttle + .clone() + .unwrap_or(global_conf.timeline_get_throttle), + image_layer_creation_check_threshold: self + .image_layer_creation_check_threshold + .unwrap_or(global_conf.image_layer_creation_check_threshold), + switch_aux_file_policy: self + .switch_aux_file_policy + .unwrap_or(global_conf.switch_aux_file_policy), + lsn_lease_length: self + .lsn_lease_length + .unwrap_or(global_conf.lsn_lease_length), + lsn_lease_length_for_ts: self + .lsn_lease_length_for_ts + .unwrap_or(global_conf.lsn_lease_length_for_ts), } } } @@ -510,6 +580,9 @@ impl Default for TenantConf { compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) .expect("cannot parse default compaction period"), compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_algorithm: CompactionAlgorithmSettings { + kind: DEFAULT_COMPACTION_ALGORITHM, + }, gc_horizon: 
DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), @@ -531,8 +604,13 @@ impl Default for TenantConf { DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), - gc_feedback: false, heatmap_period: Duration::ZERO, + lazy_slru_download: false, + timeline_get_throttle: crate::tenant::throttle::Config::disabled(), + image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), + lsn_lease_length: LsnLease::DEFAULT_LENGTH, + lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, } } } @@ -576,6 +654,44 @@ impl TryFrom for TenantConfOpt { } } +/// This is a conversion from our internal tenant config object to the one used +/// in external APIs. +impl From for models::TenantConfig { + fn from(value: TenantConfOpt) -> Self { + fn humantime(d: Duration) -> String { + format!("{}s", d.as_secs()) + } + Self { + checkpoint_distance: value.checkpoint_distance, + checkpoint_timeout: value.checkpoint_timeout.map(humantime), + compaction_algorithm: value.compaction_algorithm, + compaction_target_size: value.compaction_target_size, + compaction_period: value.compaction_period.map(humantime), + compaction_threshold: value.compaction_threshold, + gc_horizon: value.gc_horizon, + gc_period: value.gc_period.map(humantime), + image_creation_threshold: value.image_creation_threshold, + pitr_interval: value.pitr_interval.map(humantime), + walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime), + lagging_wal_timeout: value.lagging_wal_timeout.map(humantime), + max_lsn_wal_lag: value.max_lsn_wal_lag, + trace_read_requests: value.trace_read_requests, + eviction_policy: value.eviction_policy, + min_resident_size_override: value.min_resident_size_override, + evictions_low_residence_duration_metric_threshold: value + 
.evictions_low_residence_duration_metric_threshold + .map(humantime), + heatmap_period: value.heatmap_period.map(humantime), + lazy_slru_download: value.lazy_slru_download, + timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), + image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + switch_aux_file_policy: value.switch_aux_file_policy, + lsn_lease_length: value.lsn_lease_length.map(humantime), + lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 2f606ed822..8b36aa15e5 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -3,18 +3,22 @@ use std::sync::Arc; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::{models::TenantState, shard::TenantShardId}; -use remote_storage::{GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; -use tracing::{error, instrument, Instrument, Span}; +use tracing::{error, instrument, Instrument}; -use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId}; +use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; use crate::{ config::PageServerConf, context::RequestContext, task_mgr::{self, TaskKind}, - tenant::mgr::{TenantSlot, TenantsMapRemoveResult}, + tenant::{ + mgr::{TenantSlot, TenantsMapRemoveResult}, + remote_timeline_client::remote_heatmap_path, + timeline::ShutdownMode, + }, }; use super::{ @@ -84,16 +88,18 @@ async fn create_remote_delete_mark( let data = bytes::Bytes::from_static(data); let stream = futures::stream::once(futures::future::ready(Ok(data))); remote_storage - .upload(stream, 0, &remote_mark_path, None) + .upload(stream, 0, &remote_mark_path, None, cancel) .await }, - |_e| false, + 
TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "mark_upload", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), + cancel, ) .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) .context("mark_upload")?; Ok(()) @@ -109,6 +115,7 @@ async fn create_local_delete_mark( let _ = std::fs::OpenOptions::new() .write(true) .create(true) + .truncate(true) .open(&marker_path) .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; @@ -136,7 +143,11 @@ async fn schedule_ordered_timeline_deletions( let mut already_running_deletions = vec![]; for (timeline_id, _) in sorted.into_iter().rev() { - if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await { + let span = tracing::info_span!("timeline_delete", %timeline_id); + let res = DeleteTimelineFlow::run(tenant, timeline_id, true) + .instrument(span) + .await; + if let Err(e) = res { match e { DeleteTimelineError::NotFound => { // Timeline deletion finished after call to clone above but before call @@ -171,23 +182,23 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del async fn remove_tenant_remote_delete_mark( conf: &PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, + remote_storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { - if let Some(remote_storage) = remote_storage { - let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; - backoff::retry( - || async { remote_storage.delete(&path).await }, - |_e| false, - FAILED_UPLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "remove_tenant_remote_delete_mark", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), - ) - .await - .context("remove_tenant_remote_delete_mark")?; - } + let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?; + backoff::retry( + || async { 
remote_storage.delete(&path, cancel).await }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_tenant_remote_delete_mark", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("remove_tenant_remote_delete_mark")?; Ok(()) } @@ -238,6 +249,8 @@ async fn cleanup_remaining_fs_traces( rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?; + rm(conf.tenant_heatmap_path(tenant_shard_id), false).await?; + fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { Err(anyhow::anyhow!( "failpoint: tenant-delete-before-remove-tenant-dir" @@ -283,9 +296,10 @@ impl DeleteTenantFlow { #[instrument(skip_all)] pub(crate) async fn run( conf: &'static PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: Arc, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { span::debug_assert_current_span_has_tenant_id(); @@ -293,7 +307,7 @@ impl DeleteTenantFlow { let mut guard = Self::prepare(&tenant).await?; - if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { + if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await { tenant.set_broken(format!("{e:#}")).await; return Err(e); } @@ -310,8 +324,9 @@ impl DeleteTenantFlow { async fn run_inner( guard: &mut OwnedMutexGuard, conf: &'static PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, + remote_storage: &GenericRemoteStorage, tenant: &Tenant, + cancel: &CancellationToken, ) -> Result<(), DeleteTenantError> { guard.mark_in_progress()?; @@ -321,20 +336,9 @@ impl DeleteTenantFlow { ))? }); - // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend. - // Though sounds scary, different mark name? 
- // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. - if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark( - conf, - remote_storage, - &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), - ) + create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel) .await - .context("remote_mark")? - } + .context("remote_mark")?; fail::fail_point!("tenant-delete-before-create-local-mark", |_| { Err(anyhow::anyhow!( @@ -409,7 +413,10 @@ impl DeleteTenantFlow { .await .expect("cant be stopping or broken"); - tenant.attach(preload, ctx).await.context("attach")?; + tenant + .attach(preload, super::SpawnMode::Eager, ctx) + .await + .context("attach")?; Self::background( guard, @@ -421,6 +428,11 @@ impl DeleteTenantFlow { .await } + /// Check whether background deletion of this tenant is currently in progress + pub(crate) fn is_in_progress(tenant: &Tenant) -> bool { + tenant.delete_progress.try_lock().is_err() + } + async fn prepare( tenant: &Arc, ) -> Result, DeleteTenantError> { @@ -451,7 +463,7 @@ impl DeleteTenantFlow { // tenant.shutdown // Its also bad that we're holding tenants.read here. // TODO relax set_stopping to be idempotent? 
- if tenant.shutdown(progress, false).await.is_err() { + if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() { return Err(DeleteTenantError::Other(anyhow::anyhow!( "tenant shutdown is already in progress" ))); @@ -463,7 +475,7 @@ impl DeleteTenantFlow { fn schedule_background( guard: OwnedMutexGuard, conf: &'static PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: Arc, ) { @@ -485,18 +497,14 @@ impl DeleteTenantFlow { }; Ok(()) } - .instrument({ - let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), ); } async fn background( mut guard: OwnedMutexGuard, conf: &PageServerConf, - remote_storage: Option, + remote_storage: GenericRemoteStorage, tenants: &'static std::sync::RwLock, tenant: &Arc, ) -> Result<(), DeleteTenantError> { @@ -524,6 +532,25 @@ impl DeleteTenantFlow { } } + // Remove top-level tenant objects that don't belong to a timeline, such as heatmap + let heatmap_path = remote_heatmap_path(&tenant.tenant_shard_id()); + if let Some(Err(e)) = backoff::retry( + || async { + remote_storage + .delete(&heatmap_path, &task_mgr::shutdown_token()) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "remove_remote_tenant_heatmap", + &task_mgr::shutdown_token(), + ) + .await + { + tracing::warn!("Failed to delete heatmap at {heatmap_path}: {e}"); + } + let timelines_path = conf.timelines_path(&tenant.tenant_shard_id); // May not exist if we fail in cleanup_remaining_fs_traces after removing it if timelines_path.exists() { @@ -535,13 +562,13 @@ impl DeleteTenantFlow { remove_tenant_remote_delete_mark( conf, - remote_storage.as_ref(), + 
&remote_storage, &tenant.tenant_shard_id, - // Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token - &CancellationToken::new(), + &task_mgr::shutdown_token(), ) .await?; + pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable"); fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { Err(anyhow::anyhow!( "failpoint: tenant-delete-before-cleanup-remaining-fs-traces" @@ -569,9 +596,20 @@ impl DeleteTenantFlow { // FIXME: we should not be modifying this from outside of mgr.rs. // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) - crate::metrics::TENANT_MANAGER - .tenant_slots - .set(locked.len() as u64); + + // Update stats + match &removed { + TenantsMapRemoveResult::Occupied(slot) => { + crate::metrics::TENANT_MANAGER.slot_removed(slot); + } + TenantsMapRemoveResult::InProgress(barrier) => { + crate::metrics::TENANT_MANAGER + .slot_removed(&TenantSlot::InProgress(barrier.clone())); + } + TenantsMapRemoveResult::Vacant => { + // Nothing changed in map, no metric update + } + } match removed { TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => { diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 06a04bf536..119df3e6c4 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -18,11 +18,19 @@ //! - An Iterator interface would be more convenient for the callers than the //! 'visit' function //! 
+use async_stream::try_stream; use byteorder::{ReadBytesExt, BE}; use bytes::{BufMut, Bytes, BytesMut}; use either::Either; +use futures::{Stream, StreamExt}; use hex; -use std::{cmp::Ordering, io, result}; +use std::{ + cmp::Ordering, + io, + iter::Rev, + ops::{Range, RangeInclusive}, + result, +}; use thiserror::Error; use tracing::error; @@ -36,7 +44,6 @@ use crate::{ pub const VALUE_SZ: usize = 5; pub const MAX_VALUE: u64 = 0x007f_ffff_ffff; -#[allow(dead_code)] pub const PAGE_SZ: usize = 8192; #[derive(Clone, Copy, Debug)] @@ -252,6 +259,100 @@ where Ok(result) } + pub fn iter<'a>( + &'a self, + start_key: &'a [u8; L], + ctx: &'a RequestContext, + ) -> DiskBtreeIterator<'a> { + DiskBtreeIterator { + stream: Box::pin(self.get_stream_from(start_key, ctx)), + } + } + + /// Return a stream which yields all key, value pairs from the index + /// starting from the first key greater or equal to `start_key`. + /// + /// Note that this is a copy of [`Self::visit`]. + /// TODO: Once the sequential read path is removed this will become + /// the only index traversal method. + pub fn get_stream_from<'a>( + &'a self, + start_key: &'a [u8; L], + ctx: &'a RequestContext, + ) -> impl Stream, u64), DiskBtreeError>> + 'a { + try_stream! { + let mut stack = Vec::new(); + stack.push((self.root_blk, None)); + let block_cursor = self.reader.block_cursor(); + while let Some((node_blknum, opt_iter)) = stack.pop() { + // Locate the node. 
+ let node_buf = block_cursor + .read_blk(self.start_blk + node_blknum, ctx) + .await?; + + let node = OnDiskNode::deparse(node_buf.as_ref())?; + let prefix_len = node.prefix_len as usize; + let suffix_len = node.suffix_len as usize; + + assert!(node.num_children > 0); + + let mut keybuf = Vec::new(); + keybuf.extend(node.prefix); + keybuf.resize(prefix_len + suffix_len, 0); + + let mut iter: Either, Rev>> = if let Some(iter) = opt_iter { + iter + } else { + // Locate the first match + let idx = match node.binary_search(start_key, keybuf.as_mut_slice()) { + Ok(idx) => idx, + Err(idx) => { + if node.level == 0 { + // Imagine that the node contains the following keys: + // + // 1 + // 3 <-- idx + // 5 + // + // If the search key is '2' and there is exact match, + // the binary search would return the index of key + // '3'. That's cool, '3' is the first key to return. + idx + } else { + // This is an internal page, so each key represents a lower + // bound for what's in the child page. If there is no exact + // match, we have to return the *previous* entry. + // + // 1 <-- return this + // 3 <-- idx + // 5 + idx.saturating_sub(1) + } + } + }; + Either::Left(idx..node.num_children.into()) + }; + + // idx points to the first match now. Keep going from there + while let Some(idx) = iter.next() { + let key_off = idx * suffix_len; + let suffix = &node.keys[key_off..key_off + suffix_len]; + keybuf[prefix_len..].copy_from_slice(suffix); + let value = node.value(idx); + #[allow(clippy::collapsible_if)] + if node.level == 0 { + // leaf + yield (keybuf.clone(), value.to_u64()); + } else { + stack.push((node_blknum, Some(iter))); + stack.push((value.to_blknum(), None)); + break; + } + } + } + } + } + /// /// Scan the tree, starting from 'search_key', in the given direction. 
'visitor' /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning @@ -405,6 +506,19 @@ where } } +pub struct DiskBtreeIterator<'a> { + #[allow(clippy::type_complexity)] + stream: std::pin::Pin< + Box, u64), DiskBtreeError>> + 'a>, + >, +} + +impl<'a> DiskBtreeIterator<'a> { + pub async fn next(&mut self) -> Option, u64), DiskBtreeError>> { + self.stream.next().await + } +} + /// /// Public builder object, for creating a new tree. /// @@ -701,8 +815,6 @@ impl BuildNode { #[cfg(test)] pub(crate) mod tests { use super::*; - use crate::context::DownloadBehavior; - use crate::task_mgr::TaskKind; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef}; use rand::Rng; use std::collections::BTreeMap; @@ -999,6 +1111,17 @@ pub(crate) mod tests { == all_data.get(&u128::MAX).cloned() ); + // Test iterator and get_stream API + let mut iter = reader.iter(&[0; 16], &ctx); + let mut cnt = 0; + while let Some(res) = iter.next().await { + let (key, val) = res?; + let key = u128::from_be_bytes(key.as_slice().try_into().unwrap()); + assert_eq!(val, *all_data.get(&key).unwrap()); + cnt += 1; + } + assert_eq!(cnt, all_data.len()); + Ok(()) } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 591eacd104..79cc7bf153 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -3,37 +3,32 @@ use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; +use crate::page_cache; use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; -use crate::virtual_file::VirtualFile; +use crate::virtual_file::{self, VirtualFile}; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use std::cmp::min; -use std::fs::OpenOptions; -use std::io::{self, ErrorKind}; -use std::ops::DerefMut; + +use std::io; use std::sync::atomic::AtomicU64; -use tracing::*; use utils::id::TimelineId; pub struct 
EphemeralFile { - page_cache_file_id: page_cache::FileId, - _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - file: VirtualFile, - len: u64, - /// An ephemeral file is append-only. - /// We keep the last page, which can still be modified, in [`Self::mutable_tail`]. - /// The other pages, which can no longer be modified, are accessed through the page cache. - mutable_tail: [u8; PAGE_SZ], + + rw: page_caching::RW, } +mod page_caching; +mod zero_padded_read_write; + impl EphemeralFile { pub async fn create( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + ctx: &RequestContext, ) -> Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = @@ -47,22 +42,27 @@ impl EphemeralFile { let file = VirtualFile::open_with_options( &filename, - OpenOptions::new().read(true).write(true).create(true), + virtual_file::OpenOptions::new() + .read(true) + .write(true) + .create(true), + ctx, ) .await?; Ok(EphemeralFile { - page_cache_file_id: page_cache::next_file_id(), _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - file, - len: 0, - mutable_tail: [0u8; PAGE_SZ], + rw: page_caching::RW::new(file), }) } pub(crate) fn len(&self) -> u64 { - self.len + self.rw.bytes_written() + } + + pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { + self.rw.page_cache_file_id() } pub(crate) async fn read_blk( @@ -70,39 +70,7 @@ impl EphemeralFile { blknum: u32, ctx: &RequestContext, ) -> Result { - let flushed_blknums = 0..self.len / PAGE_SZ as u64; - if flushed_blknums.contains(&(blknum as u64)) { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, self.file.path, e, - ), - ) - })? 
{ - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - self.file - .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - }; - } else { - debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64); - Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail)) - } + self.rw.read_blk(blknum, ctx).await } pub(crate) async fn write_blob( @@ -110,122 +78,22 @@ impl EphemeralFile { srcbuf: &[u8], ctx: &RequestContext, ) -> Result { - struct Writer<'a> { - ephemeral_file: &'a mut EphemeralFile, - /// The block to which the next [`push_bytes`] will write. - blknum: u32, - /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write. - off: usize, - } - impl<'a> Writer<'a> { - fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result> { - Ok(Writer { - blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32, - off: (ephemeral_file.len % PAGE_SZ as u64) as usize, - ephemeral_file, - }) - } - #[inline(always)] - async fn push_bytes( - &mut self, - src: &[u8], - ctx: &RequestContext, - ) -> Result<(), io::Error> { - let mut src_remaining = src; - while !src_remaining.is_empty() { - let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..]; - let n = min(dst_remaining.len(), src_remaining.len()); - dst_remaining[..n].copy_from_slice(&src_remaining[..n]); - self.off += n; - src_remaining = &src_remaining[n..]; - if self.off == PAGE_SZ { - match self - .ephemeral_file - .file - .write_all_at( - &self.ephemeral_file.mutable_tail, - self.blknum as u64 * PAGE_SZ as u64, - ) - .await - { - Ok(_) => { - // Pre-warm the page cache with what we just wrote. 
- // This isn't necessary for coherency/correctness, but it's how we've always done it. - let cache = page_cache::get(); - match cache - .read_immutable_buf( - self.ephemeral_file.page_cache_file_id, - self.blknum, - ctx, - ) - .await - { - Ok(page_cache::ReadBufResult::Found(_guard)) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. - unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum); - } - Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => { - let buf: &mut [u8] = write_guard.deref_mut(); - debug_assert_eq!(buf.len(), PAGE_SZ); - buf.copy_from_slice(&self.ephemeral_file.mutable_tail); - let _ = write_guard.mark_valid(); - // pre-warm successful - } - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - } - // Zero the buffer for re-use. - // Zeroing is critical for correcntess because the write_blob code below - // and similarly read_blk expect zeroed pages. - self.ephemeral_file.mutable_tail.fill(0); - // This block is done, move to next one. 
- self.blknum += 1; - self.off = 0; - } - Err(e) => { - return Err(std::io::Error::new( - ErrorKind::Other, - // order error before path because path is long and error is short - format!( - "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}", - self.blknum, - e, - self.ephemeral_file.file.path, - ), - )); - } - } - } - } - Ok(()) - } - } - - let pos = self.len; - let mut writer = Writer::new(self)?; + let pos = self.rw.bytes_written(); // Write the length field if srcbuf.len() < 0x80 { // short one-byte length header let len_buf = [srcbuf.len() as u8]; - writer.push_bytes(&len_buf, ctx).await?; + + self.rw.write_all_borrowed(&len_buf, ctx).await?; } else { let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32); len_buf[0] |= 0x80; - writer.push_bytes(&len_buf, ctx).await?; + self.rw.write_all_borrowed(&len_buf, ctx).await?; } // Write the payload - writer.push_bytes(srcbuf, ctx).await?; - - if srcbuf.len() < 0x80 { - self.len += 1; - } else { - self.len += 4; - } - self.len += srcbuf.len() as u64; + self.rw.write_all_borrowed(srcbuf, ctx).await?; Ok(pos) } @@ -240,28 +108,6 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl Drop for EphemeralFile { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - let res = std::fs::remove_file(&self.file.path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. 
- // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!( - "could not remove ephemeral file '{}': {}", - self.file.path, e - ); - } - } - } -} - impl BlockReader for EphemeralFile { fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) @@ -273,7 +119,7 @@ mod tests { use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use crate::tenant::block_io::{BlockCursor, BlockReaderRef}; + use crate::tenant::block_io::BlockReaderRef; use rand::{thread_rng, RngCore}; use std::fs; use std::str::FromStr; @@ -309,7 +155,7 @@ mod tests { async fn test_ephemeral_blobs() -> Result<(), io::Error> { let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &ctx).await?; let pos_foo = file.write_blob(b"foo", &ctx).await?; assert_eq!( diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs new file mode 100644 index 0000000000..276ac87064 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -0,0 +1,223 @@ +//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the +//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. + +use crate::context::RequestContext; +use crate::page_cache::{self, PAGE_SZ}; +use crate::tenant::block_io::BlockLease; +use crate::virtual_file::VirtualFile; + +use once_cell::sync::Lazy; +use std::io::{self, ErrorKind}; +use tokio_epoll_uring::BoundedBuf; +use tracing::*; + +use super::zero_padded_read_write; + +/// See module-level comment. 
+pub struct RW { + page_cache_file_id: page_cache::FileId, + rw: super::zero_padded_read_write::RW, +} + +impl RW { + pub fn new(file: VirtualFile) -> Self { + let page_cache_file_id = page_cache::next_file_id(); + Self { + page_cache_file_id, + rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( + page_cache_file_id, + file, + )), + } + } + + pub fn page_cache_file_id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + + pub(crate) async fn write_all_borrowed( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> Result { + // It doesn't make sense to proactively fill the page cache on the Pageserver write path + // because Compute is unlikely to access recently written data. + self.rw.write_all_borrowed(srcbuf, ctx).await + } + + pub(crate) fn bytes_written(&self) -> u64 { + self.rw.bytes_written() + } + + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result { + match self.rw.read_blk(blknum).await? { + zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => { + let cache = page_cache::get(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, ctx) + .await + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::Other, + // order path before error because error is anyhow::Error => might have many contexts + format!( + "ephemeral file: read immutable page #{}: {}: {:#}", + blknum, + self.rw.as_writer().file.path, + e, + ), + ) + })? 
{ + page_cache::ReadBufResult::Found(guard) => { + return Ok(BlockLease::PageReadGuard(guard)) + } + page_cache::ReadBufResult::NotFound(write_guard) => { + let write_guard = writer + .file + .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx) + .await?; + let read_guard = write_guard.mark_valid(); + return Ok(BlockLease::PageReadGuard(read_guard)); + } + } + } + zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => { + Ok(BlockLease::EphemeralFileMutableTail(buffer)) + } + } + } +} + +impl Drop for RW { + fn drop(&mut self) { + // There might still be pages in the [`crate::page_cache`] for this file. + // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. + + // unlink the file + let res = std::fs::remove_file(&self.rw.as_writer().file.path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. 
+ // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!( + "could not remove ephemeral file '{}': {}", + self.rw.as_writer().file.path, + e + ); + } + } + } +} + +struct PreWarmingWriter { + nwritten_blocks: u32, + page_cache_file_id: page_cache::FileId, + file: VirtualFile, +} + +impl PreWarmingWriter { + fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + Self { + nwritten_blocks: 0, + page_cache_file_id, + file, + } + } +} + +impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { + async fn write_all< + B: tokio_epoll_uring::BoundedBuf, + Buf: tokio_epoll_uring::IoBuf + Send, + >( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let buf = buf.slice(..); + let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done + let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) { + Some(buf.to_vec()) + } else { + None + }; + let buflen = buf.len(); + assert_eq!( + buflen % PAGE_SZ, + 0, + "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used" + ); + + // Do the IO. 
+ let iobuf = match self.file.write_all(buf, ctx).await { + (iobuf, Ok(nwritten)) => { + assert_eq!(nwritten, buflen); + iobuf + } + (_, Err(e)) => { + return Err(std::io::Error::new( + ErrorKind::Other, + // order error before path because path is long and error is short + format!( + "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}", + self.nwritten_blocks, buflen, e, self.file.path, + ), + )); + } + }; + + // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf) + let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds); + if let Some(check_bounds_stuff_works) = check_bounds_stuff_works { + assert_eq!(&check_bounds_stuff_works, &*buf); + } + + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + let nblocks = buflen / PAGE_SZ; + let nblocks32 = u32::try_from(nblocks).unwrap(); + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here + } + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. 
+ unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } + } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); + Ok((buflen, buf.into_inner())) + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs new file mode 100644 index 0000000000..b37eafb52c --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -0,0 +1,130 @@ +//! The heart of how [`super::EphemeralFile`] does its reads and writes. +//! +//! # Writes +//! +//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`]. +//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`]. +//! +//! # Reads +//! +//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`]. +//! +//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer +//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`] +//! if the read is for the prefix that has already been flushed. +//! +//! # Current Usage +//! +//! The current user of this module is [`super::page_caching::RW`]. + +mod zero_padded; + +use crate::{ + context::RequestContext, + page_cache::PAGE_SZ, + virtual_file::owned_buffers_io::{ + self, + write::{Buffer, OwnedAsyncWriter}, + }, +}; + +const TAIL_SZ: usize = 64 * 1024; + +/// See module-level comment. 
+pub struct RW { + buffered_writer: owned_buffers_io::write::BufferedWriter< + zero_padded::Buffer, + owned_buffers_io::util::size_tracking_writer::Writer, + >, +} + +pub enum ReadResult<'a, W> { + NeedsReadFromWriter { writer: &'a W }, + ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] }, +} + +impl RW +where + W: OwnedAsyncWriter, +{ + pub fn new(writer: W) -> Self { + let bytes_flushed_tracker = + owned_buffers_io::util::size_tracking_writer::Writer::new(writer); + let buffered_writer = owned_buffers_io::write::BufferedWriter::new( + bytes_flushed_tracker, + zero_padded::Buffer::default(), + ); + Self { buffered_writer } + } + + pub(crate) fn as_writer(&self) -> &W { + self.buffered_writer.as_inner().as_inner() + } + + pub async fn write_all_borrowed( + &mut self, + buf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + self.buffered_writer.write_buffered_borrowed(buf, ctx).await + } + + pub fn bytes_written(&self) -> u64 { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + flushed_offset + u64::try_from(buffer.pending()).unwrap() + } + + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { + let flushed_offset = self.buffered_writer.as_inner().bytes_written(); + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap(); + let read_offset = (blknum as u64) * (PAGE_SZ as u64); + + // The trailing page ("block") might only be partially filled, + // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway. + // Moreover, it has to be zero-padded, because when we still had + // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it. + // DeltaLayer probably has the same issue, not sure why it needs no special treatment. 
+ // => check here that the read doesn't go beyond this potentially trailing + // => the zero-padding is done in the `else` branch below + let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 { + buffered_offset / (PAGE_SZ as u64) + } else { + (buffered_offset / (PAGE_SZ as u64)) + 1 + }; + if (blknum as u64) >= blocks_written { + return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"))); + } + + // assertions for the `if-else` below + assert_eq!( + flushed_offset % (TAIL_SZ as u64), 0, + "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks" + ); + assert_eq!( + flushed_offset % (PAGE_SZ as u64), + 0, + "the logic below can't handle if the page is spread across the flushed part and the buffer" + ); + + if read_offset < flushed_offset { + assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset); + Ok(ReadResult::NeedsReadFromWriter { + writer: self.as_writer(), + }) + } else { + let read_offset_in_buffer = read_offset + .checked_sub(flushed_offset) + .expect("would have taken `if` branch instead of this one"); + let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap(); + let zero_padded_slice = buffer.as_zero_padded_slice(); + let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)]; + Ok(ReadResult::ServedFromZeroPaddedMutableTail { + buffer: page + .try_into() + .expect("the slice above got it as page-size slice"), + }) + } + } +} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs new file mode 100644 index 0000000000..f90291bbf8 --- /dev/null +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs @@ -0,0 +1,108 @@ +//! 
A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose +//! unwritten range is guaranteed to be zero-initialized. +//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`] +//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled. + +use std::mem::MaybeUninit; + +/// See module-level comment. +pub struct Buffer { + allocation: Box<[u8; N]>, + written: usize, +} + +impl Default for Buffer { + fn default() -> Self { + Self { + allocation: Box::new( + // SAFETY: zeroed memory is a valid [u8; N] + unsafe { MaybeUninit::zeroed().assume_init() }, + ), + written: 0, + } + } +} + +impl Buffer { + #[inline(always)] + fn invariants(&self) { + // don't check by default, unoptimized is too expensive even for debug mode + if false { + debug_assert!(self.written <= N, "{}", self.written); + debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0)); + } + } + + pub fn as_zero_padded_slice(&self) -> &[u8; N] { + &self.allocation + } +} + +impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { + type IoBuf = Self; + + fn cap(&self) -> usize { + self.allocation.len() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + self.invariants(); + let remaining = self.allocation.len() - self.written; + if other.len() > remaining { + panic!("calling extend_from_slice() with insufficient remaining capacity"); + } + self.allocation[self.written..(self.written + other.len())].copy_from_slice(other); + self.written += other.len(); + self.invariants(); + } + + fn pending(&self) -> usize { + self.written + } + + fn flush(self) -> tokio_epoll_uring::Slice { + self.invariants(); + let written = self.written; + tokio_epoll_uring::BoundedBuf::slice(self, 0..written) + } + + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let Self { + mut allocation, + written, + } = iobuf; + allocation[0..written].fill(0); + let new = Self { + allocation, + written: 0, + }; + 
new.invariants(); + new + } +} + +/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a +/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data. +/// +/// Remember that bytes_init is generally _not_ a tracker of the amount +/// of valid data in the io buffer; we use `Slice` for that. +/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit. +/// +/// SAFETY: +/// +/// The [`Self::allocation`] is stable becauses boxes are stable. +/// The memory is zero-initialized, so, bytes_init is always N. +unsafe impl tokio_epoll_uring::IoBuf for Buffer { + fn stable_ptr(&self) -> *const u8 { + self.allocation.as_ptr() + } + + fn bytes_init(&self) -> usize { + // Yes, N, not self.written; Read the full comment of this impl block! + N + } + + fn bytes_total(&self) -> usize { + N + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 9b6225501f..2724a5cc07 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,9 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; -use std::collections::VecDeque; +use pageserver_api::keyspace::KeySpaceAccum; +use std::collections::{HashMap, VecDeque}; +use std::iter::Peekable; use std::ops::Range; use std::sync::Arc; use utils::lsn::Lsn; @@ -144,11 +146,206 @@ impl Drop for BatchedUpdates<'_> { } /// Return value of LayerMap::search +#[derive(Eq, PartialEq, Debug, Hash)] pub struct SearchResult { pub layer: Arc, pub lsn_floor: Lsn, } +/// Return value of [`LayerMap::range_search`] +/// +/// Contains a mapping from a layer description to a keyspace +/// accumulator that contains all the keys which intersect the layer +/// from the original search space. Keys that were not found are accumulated +/// in a separate key space accumulator. 
+#[derive(Debug)] +pub struct RangeSearchResult { + pub found: HashMap, + pub not_found: KeySpaceAccum, +} + +impl RangeSearchResult { + fn new() -> Self { + Self { + found: HashMap::new(), + not_found: KeySpaceAccum::new(), + } + } +} + +/// Collector for results of range search queries on the LayerMap. +/// It should be provided with two iterators for the delta and image coverage +/// that contain all the changes for layers which intersect the range. +struct RangeSearchCollector +where + Iter: Iterator>)>, +{ + delta_coverage: Peekable, + image_coverage: Peekable, + key_range: Range, + end_lsn: Lsn, + + current_delta: Option>, + current_image: Option>, + + result: RangeSearchResult, +} + +#[derive(Debug)] +enum NextLayerType { + Delta(i128), + Image(i128), + Both(i128), +} + +impl NextLayerType { + fn next_change_at_key(&self) -> Key { + match self { + NextLayerType::Delta(at) => Key::from_i128(*at), + NextLayerType::Image(at) => Key::from_i128(*at), + NextLayerType::Both(at) => Key::from_i128(*at), + } + } +} + +impl RangeSearchCollector +where + Iter: Iterator>)>, +{ + fn new( + key_range: Range, + end_lsn: Lsn, + delta_coverage: Iter, + image_coverage: Iter, + ) -> Self { + Self { + delta_coverage: delta_coverage.peekable(), + image_coverage: image_coverage.peekable(), + key_range, + end_lsn, + current_delta: None, + current_image: None, + result: RangeSearchResult::new(), + } + } + + /// Run the collector. Collection is implemented via a two pointer algorithm. + /// One pointer tracks the start of the current range and the other tracks + /// the beginning of the next range which will overlap with the next change + /// in coverage across both image and delta. 
+ fn collect(mut self) -> RangeSearchResult { + let next_layer_type = self.choose_next_layer_type(); + let mut current_range_start = match next_layer_type { + None => { + // No changes for the range + self.pad_range(self.key_range.clone()); + return self.result; + } + Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => { + // Changes only after the end of the range + self.pad_range(self.key_range.clone()); + return self.result; + } + Some(layer_type) => { + // Changes for the range exist. Record anything before the first + // coverage change as not found. + let coverage_start = layer_type.next_change_at_key(); + let range_before = self.key_range.start..coverage_start; + self.pad_range(range_before); + + self.advance(&layer_type); + coverage_start + } + }; + + while current_range_start < self.key_range.end { + let next_layer_type = self.choose_next_layer_type(); + match next_layer_type { + Some(t) => { + let current_range_end = t.next_change_at_key(); + self.add_range(current_range_start..current_range_end); + current_range_start = current_range_end; + + self.advance(&t); + } + None => { + self.add_range(current_range_start..self.key_range.end); + current_range_start = self.key_range.end; + } + } + } + + self.result + } + + /// Mark a range as not found (i.e. no layers intersect it) + fn pad_range(&mut self, key_range: Range) { + if !key_range.is_empty() { + self.result.not_found.add_range(key_range); + } + } + + /// Select the appropiate layer for the given range and update + /// the collector. + fn add_range(&mut self, covered_range: Range) { + let selected = LayerMap::select_layer( + self.current_delta.clone(), + self.current_image.clone(), + self.end_lsn, + ); + + match selected { + Some(search_result) => self + .result + .found + .entry(search_result) + .or_default() + .add_range(covered_range), + None => self.pad_range(covered_range), + } + } + + /// Move to the next coverage change. 
+ fn advance(&mut self, layer_type: &NextLayerType) { + match layer_type { + NextLayerType::Delta(_) => { + let (_, layer) = self.delta_coverage.next().unwrap(); + self.current_delta = layer; + } + NextLayerType::Image(_) => { + let (_, layer) = self.image_coverage.next().unwrap(); + self.current_image = layer; + } + NextLayerType::Both(_) => { + let (_, image_layer) = self.image_coverage.next().unwrap(); + let (_, delta_layer) = self.delta_coverage.next().unwrap(); + + self.current_image = image_layer; + self.current_delta = delta_layer; + } + } + } + + /// Pick the next coverage change: the one at the lesser key or both if they're alligned. + fn choose_next_layer_type(&mut self) -> Option { + let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key); + let next_image_at = self.image_coverage.peek().map(|(key, _)| key); + + match (next_delta_at, next_image_at) { + (None, None) => None, + (Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)), + (None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)), + (Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => { + Some(NextLayerType::Image(*next_image_at)) + } + (Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => { + Some(NextLayerType::Delta(*next_delta_at)) + } + (Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)), + } + } +} + impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -186,7 +383,18 @@ impl LayerMap { let latest_delta = version.delta_coverage.query(key.to_i128()); let latest_image = version.image_coverage.query(key.to_i128()); - match (latest_delta, latest_image) { + Self::select_layer(latest_delta, latest_image, end_lsn) + } + + fn select_layer( + delta_layer: Option>, + image_layer: Option>, + end_lsn: Lsn, + ) -> Option { + assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta())); + assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta())); + + 
match (delta_layer, image_layer) { (None, None) => None, (None, Some(image)) => { let lsn_floor = image.get_lsn_range().start; @@ -223,6 +431,24 @@ impl LayerMap { } } + pub fn range_search(&self, key_range: Range, end_lsn: Lsn) -> RangeSearchResult { + let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) { + Some(version) => version, + None => { + let mut result = RangeSearchResult::new(); + result.not_found.add_range(key_range); + return result; + } + }; + + let raw_range = key_range.start.to_i128()..key_range.end.to_i128(); + let delta_changes = version.delta_coverage.range_overlaps(&raw_range); + let image_changes = version.image_coverage.range_overlaps(&raw_range); + + let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes); + collector.collect() + } + /// Start a batch of updates, applied on drop pub fn batch_update(&mut self) -> BatchedUpdates<'_> { BatchedUpdates { layer_map: self } @@ -283,15 +509,15 @@ impl LayerMap { /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. - pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> Result { + pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> bool { if key.is_empty() { // Vacuously true. There's a newer image for all 0 of the kerys in the range. 
- return Ok(true); + return true; } let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { Some(v) => v, - None => return Ok(false), + None => return false, }; let start = key.start.to_i128(); @@ -304,39 +530,49 @@ impl LayerMap { // Check the start is covered if !layer_covers(version.image_coverage.query(start)) { - return Ok(false); + return false; } // Check after all changes of coverage for (_, change_val) in version.image_coverage.range(start..end) { if !layer_covers(change_val) { - return Ok(false); + return false; } } - Ok(true) + true } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { self.historic.iter() } + /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. + pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> + where + Pred: FnMut(&Arc) -> bool, + { + if let Some(open) = &self.open_layer { + if pred(open) { + return Some(open.clone()); + } + } + + self.frozen_layers.iter().rfind(|l| pred(l)).cloned() + } + /// /// Divide the whole given range of keys into sub-ranges based on the latest /// image layer that covers each range at the specified lsn (inclusive). /// This is used when creating new image layers. - /// - // FIXME: clippy complains that the result type is very complex. She's probably - // right... 
- #[allow(clippy::type_complexity)] pub fn image_coverage( &self, key_range: &Range, lsn: Lsn, - ) -> Result, Option>)>> { + ) -> Vec<(Range, Option>)> { let version = match self.historic.get().unwrap().get_version(lsn.0) { Some(v) => v, - None => return Ok(vec![]), + None => return vec![], }; let start = key_range.start.to_i128(); @@ -352,14 +588,14 @@ impl LayerMap { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); coverage.push((kr, current_val.take())); current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Add the final interval let kr = Key::from_i128(current_key)..Key::from_i128(end); coverage.push((kr, current_val.take())); - Ok(coverage) + coverage } pub fn is_l0(layer: &PersistentLayerDesc) -> bool { @@ -410,24 +646,19 @@ impl LayerMap { /// This number is used to compute the largest number of deltas that /// we'll need to visit for any page reconstruction in this region. /// We use this heuristic to decide whether to create an image layer. - pub fn count_deltas( - &self, - key: &Range, - lsn: &Range, - limit: Option, - ) -> Result { + pub fn count_deltas(&self, key: &Range, lsn: &Range, limit: Option) -> usize { // We get the delta coverage of the region, and for each part of the coverage // we recurse right underneath the delta. The recursion depth is limited by // the largest result this function could return, which is in practice between // 3 and 10 (since we usually try to create an image when the number gets larger). 
if lsn.is_empty() || key.is_empty() || limit == Some(0) { - return Ok(0); + return 0; } let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { Some(v) => v, - None => return Ok(0), + None => return 0, }; let start = key.start.to_i128(); @@ -441,15 +672,14 @@ impl LayerMap { // Loop through the delta coverage and recurse on each part for (change_key, change_val) in version.delta_coverage.range(start..end) { // If there's a relevant delta in this part, add 1 and recurse down - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(change_key); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); - let max_stacked_deltas_underneath = - self.count_deltas(&kr, &lr, new_limit)?; + let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( max_stacked_deltas, base_count + max_stacked_deltas_underneath, @@ -459,19 +689,19 @@ impl LayerMap { } current_key = change_key; - current_val = change_val.clone(); + current_val.clone_from(&change_val); } // Consider the last part - if let Some(val) = current_val { + if let Some(val) = ¤t_val { if val.get_lsn_range().end > lsn.start { let kr = Key::from_i128(current_key)..Key::from_i128(end); let lr = lsn.start..val.get_lsn_range().start; if !kr.is_empty() { - let base_count = Self::is_reimage_worthy(&val, key) as usize; + let base_count = Self::is_reimage_worthy(val, key) as usize; let new_limit = limit.map(|l| l - base_count); - let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; + let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit); max_stacked_deltas = std::cmp::max( max_stacked_deltas, base_count + 
max_stacked_deltas_underneath, @@ -480,7 +710,7 @@ impl LayerMap { } } - Ok(max_stacked_deltas) + max_stacked_deltas } /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. @@ -592,10 +822,7 @@ impl LayerMap { if limit == Some(difficulty) { break; } - for (img_range, last_img) in self - .image_coverage(range, lsn) - .expect("why would this err?") - { + for (img_range, last_img) in self.image_coverage(range, lsn) { if limit == Some(difficulty) { break; } @@ -606,9 +833,7 @@ impl LayerMap { }; if img_lsn < lsn { - let num_deltas = self - .count_deltas(&img_range, &(img_lsn..lsn), limit) - .expect("why would this err lol?"); + let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit); difficulty = std::cmp::max(difficulty, num_deltas); } } @@ -646,3 +871,134 @@ impl LayerMap { Ok(()) } } + +#[cfg(test)] +mod tests { + use pageserver_api::keyspace::KeySpace; + + use super::*; + + #[derive(Clone)] + struct LayerDesc { + key_range: Range, + lsn_range: Range, + is_delta: bool, + } + + fn create_layer_map(layers: Vec) -> LayerMap { + let mut layer_map = LayerMap::default(); + + for layer in layers { + layer_map.insert_historic_noflush(PersistentLayerDesc::new_test( + layer.key_range, + layer.lsn_range, + layer.is_delta, + )); + } + + layer_map.flush_updates(); + layer_map + } + + fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) { + assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace()); + let lhs: HashMap = lhs + .found + .into_iter() + .map(|(search_result, accum)| (search_result, accum.to_keyspace())) + .collect(); + let rhs: HashMap = rhs + .found + .into_iter() + .map(|(search_result, accum)| (search_result, accum.to_keyspace())) + .collect(); + + assert_eq!(lhs, rhs); + } + + #[cfg(test)] + fn brute_force_range_search( + layer_map: &LayerMap, + key_range: Range, + end_lsn: Lsn, + ) -> RangeSearchResult { + let mut range_search_result = RangeSearchResult::new(); + + let mut key = 
key_range.start; + while key != key_range.end { + let res = layer_map.search(key, end_lsn); + match res { + Some(res) => { + range_search_result + .found + .entry(res) + .or_default() + .add_key(key); + } + None => { + range_search_result.not_found.add_key(key); + } + } + + key = key.next(); + } + + range_search_result + } + + #[test] + fn ranged_search_on_empty_layer_map() { + let layer_map = LayerMap::default(); + let range = Key::from_i128(100)..Key::from_i128(200); + + let res = layer_map.range_search(range.clone(), Lsn(100)); + assert_eq!( + res.not_found.to_keyspace(), + KeySpace { + ranges: vec![range] + } + ); + } + + #[test] + fn ranged_search() { + let layers = vec![ + LayerDesc { + key_range: Key::from_i128(15)..Key::from_i128(50), + lsn_range: Lsn(0)..Lsn(5), + is_delta: false, + }, + LayerDesc { + key_range: Key::from_i128(10)..Key::from_i128(20), + lsn_range: Lsn(5)..Lsn(20), + is_delta: true, + }, + LayerDesc { + key_range: Key::from_i128(15)..Key::from_i128(25), + lsn_range: Lsn(20)..Lsn(30), + is_delta: true, + }, + LayerDesc { + key_range: Key::from_i128(35)..Key::from_i128(40), + lsn_range: Lsn(25)..Lsn(35), + is_delta: true, + }, + LayerDesc { + key_range: Key::from_i128(35)..Key::from_i128(40), + lsn_range: Lsn(35)..Lsn(40), + is_delta: false, + }, + ]; + + let layer_map = create_layer_map(layers.clone()); + for start in 0..60 { + for end in (start + 1)..60 { + let range = Key::from_i128(start)..Key::from_i128(end); + let result = layer_map.range_search(range.clone(), Lsn(100)); + let expected = brute_force_range_search(&layer_map, range, Lsn(100)); + + assert_range_search_result_eq(result, expected); + } + } + } +} diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index 1d9101d3d1..cf0085c071 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -129,6 +129,42 @@ impl LayerCoverage { .map(|(k, v)| (*k, 
v.as_ref().map(|x| x.1.clone()))) } + /// Returns an iterator which includes all coverage changes for layers that intersect + /// with the provided range. + pub fn range_overlaps( + &self, + key_range: &Range, + ) -> impl Iterator)> + '_ + where + Value: Eq, + { + let first_change = self.query(key_range.start); + match first_change { + Some(change) => { + // If the start of the range is covered, we have to deal with two cases: + // 1. Start of the range is aligned with the start of a layer. + // In this case the return of `self.range` will contain the layer which aligns with the start of the key range. + // We advance said iterator to avoid duplicating the first change. + // 2. Start of the range is not aligned with the start of a layer. + let range = key_range.start..key_range.end; + let mut range_coverage = self.range(range).peekable(); + if range_coverage + .peek() + .is_some_and(|c| c.1.as_ref() == Some(&change)) + { + range_coverage.next(); + } + itertools::Either::Left( + std::iter::once((key_range.start, Some(change))).chain(range_coverage), + ) + } + None => { + let range = key_range.start..key_range.end; + let coverage = self.range(range); + itertools::Either::Right(coverage) + } + } + } /// O(1) clone pub fn clone(&self) -> Self { Self { diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 6fb86c65e2..6ba1bdef9b 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,42 +1,61 @@ -//! Every image of a certain timeline from [`crate::tenant::Tenant`] -//! has a metadata that needs to be stored persistently. +//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in +//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines, +//! this struct and its original serialization format is still needed because they were written a +//! long time ago. //! -//! 
Later, the file gets used in [`remote_timeline_client`] as a part of -//! external storage import and export operations. +//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json +//! versioning. //! -//! The module contains all structs and related helper methods related to timeline metadata. +//! To clean up this module we need to migrate all index_part.json files to a later version. +//! While doing this, we need to be mindful about s3 based recovery as well, so it might take +//! however long we keep the old versions to be able to delete the old code. After that, we can +//! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and +//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards +//! compatibility. //! //! [`remote_timeline_client`]: super::remote_timeline_client +//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart -use std::io::{self}; - -use anyhow::{ensure, Context}; -use pageserver_api::shard::TenantShardId; -use serde::{de::Error, Deserialize, Serialize, Serializer}; -use thiserror::Error; +use anyhow::ensure; +use serde::{Deserialize, Serialize}; use utils::bin_ser::SerializeError; -use utils::crashsafe::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; -use crate::config::PageServerConf; -use crate::virtual_file::VirtualFile; -use crate::TEMP_FILE_SUFFIX; - /// Use special format number to enable backward compatibility. const METADATA_FORMAT_VERSION: u16 = 4; /// Previous supported format versions. +/// +/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming +/// that requires a scrubber run which is yet to be done. const METADATA_OLD_FORMAT_VERSION: u16 = 3; -/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. +/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic. 
/// /// This is the same assumption that PostgreSQL makes with the control file, +/// /// see PG_CONTROL_MAX_SAFE_SIZE const METADATA_MAX_SIZE: usize = 512; -/// Metadata stored on disk for each timeline +/// Legacy metadata stored as a component of `index_part.json` per timeline. /// -/// The fields correspond to the values we hold in memory, in Timeline. +/// Do not make new changes to this type or the module. In production, we have two different kinds +/// of serializations of this type: bincode and json. Bincode version reflects what used to be +/// stored on disk in earlier versions and does internal crc32 checksumming. +/// +/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would +/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern +/// as-exists in `index_part.json` ([`self::modern_serde`]). +/// +/// ```compile_fail +/// #[derive(serde::Serialize)] +/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata); +/// ``` +/// +/// ```compile_fail +/// #[derive(serde::Deserialize)] +/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata); +/// ``` #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, @@ -49,6 +68,49 @@ struct TimelineMetadataHeader { size: u16, // size of serialized metadata format_version: u16, // metadata format version (used for compatibility checks) } + +impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader { + type Error = Crc32CalculationFailed; + + fn try_from(value: &TimelineMetadataBodyV2) -> Result { + #[derive(Default)] + struct Crc32Sink { + crc: u32, + count: usize, + } + + impl std::io::Write for Crc32Sink { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.crc = crc32c::crc32c_append(self.crc, buf); + self.count += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + // jump through hoops to 
calculate the crc32 so that TimelineMetadata::ne works + // across serialization versions + let mut sink = Crc32Sink::default(); + ::ser_into(value, &mut sink) + .map_err(Crc32CalculationFailed)?; + + let size = METADATA_HDR_SIZE + sink.count; + + Ok(TimelineMetadataHeader { + checksum: sink.crc, + size: size as u16, + format_version: METADATA_FORMAT_VERSION, + }) + } +} + +#[derive(thiserror::Error, Debug)] +#[error("re-serializing for crc32 failed")] +struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError); + const METADATA_HDR_SIZE: usize = std::mem::size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -120,6 +182,12 @@ impl TimelineMetadata { } } + #[cfg(test)] + pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result { + self.hdr = TimelineMetadataHeader::try_from(&self.body)?; + Ok(self) + } + fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result { let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; @@ -216,6 +284,24 @@ impl TimelineMetadata { self.body.ancestor_lsn } + /// When reparenting, the `ancestor_lsn` does not change. 
+ pub fn reparent(&mut self, timeline: &TimelineId) { + assert!(self.body.ancestor_timeline.is_some()); + // no assertion for redoing this: it's fine, we may have to repeat this multiple times over + self.body.ancestor_timeline = Some(*timeline); + } + + pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { + if let Some(ancestor) = self.body.ancestor_timeline { + assert_eq!(ancestor, branchpoint.0); + } + if self.body.ancestor_lsn != Lsn(0) { + assert_eq!(self.body.ancestor_lsn, branchpoint.1); + } + self.body.ancestor_timeline = None; + self.body.ancestor_lsn = Lsn(0); + } + pub fn latest_gc_cutoff_lsn(&self) -> Lsn { self.body.latest_gc_cutoff_lsn } @@ -244,65 +330,123 @@ impl TimelineMetadata { let bytes = instance.to_bytes().unwrap(); Self::from_bytes(&bytes).unwrap() } -} -impl<'de> Deserialize<'de> for TimelineMetadata { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let bytes = Vec::::deserialize(deserializer)?; - Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}"))) + pub(crate) fn apply(&mut self, update: &MetadataUpdate) { + self.body.disk_consistent_lsn = update.disk_consistent_lsn; + self.body.prev_record_lsn = update.prev_record_lsn; + self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn; } } -impl Serialize for TimelineMetadata { - fn serialize(&self, serializer: S) -> Result +pub(crate) mod modern_serde { + use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; + use serde::{Deserialize, Serialize}; + + pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result where - S: Serializer, + D: serde::de::Deserializer<'de>, { - let bytes = self - .to_bytes() - .map_err(|e| serde::ser::Error::custom(format!("{e}")))?; - bytes.serialize(serializer) + // for legacy reasons versions 1-5 had TimelineMetadata serialized as a Vec field with + // BeSer. 
+ struct Visitor; + + impl<'d> serde::de::Visitor<'d> for Visitor { + type Value = TimelineMetadata; + + fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.write_str("BeSer bytes or json structure") + } + + fn visit_seq
(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'d>, + { + use serde::de::Error; + let de = serde::de::value::SeqAccessDeserializer::new(seq); + Vec::::deserialize(de) + .map(|v| TimelineMetadata::from_bytes(&v).map_err(A::Error::custom))? + } + + fn visit_map(self, map: A) -> Result + where + A: serde::de::MapAccess<'d>, + { + use serde::de::Error; + + let de = serde::de::value::MapAccessDeserializer::new(map); + let body = TimelineMetadataBodyV2::deserialize(de)?; + let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?; + + Ok(TimelineMetadata { hdr, body }) + } + } + + deserializer.deserialize_any(Visitor) + } + + pub(crate) fn serialize( + metadata: &TimelineMetadata, + serializer: S, + ) -> Result + where + S: serde::Serializer, + { + // header is not needed, upon reading we've upgraded all v1 to v2 + metadata.body.serialize(serializer) + } + + #[test] + fn deserializes_bytes_as_well_as_equivalent_body_v2() { + #[derive(serde::Deserialize, serde::Serialize)] + struct Wrapper( + #[serde(deserialize_with = "deserialize", serialize_with = "serialize")] + TimelineMetadata, + ); + + let too_many_bytes = 
"[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]"; + + let wrapper_from_bytes = serde_json::from_str::(too_many_bytes).unwrap(); + + let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap(); + + assert_eq!( + serialized, + serde_json::json! 
{{ + "disk_consistent_lsn": "0/149FD90", + "prev_record_lsn": "0/149FD18", + "ancestor_timeline": null, + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/149FD18", + "initdb_lsn": "0/149FD18", + "pg_version": 15 + }} + ); + + let wrapper_from_json = serde_json::value::from_value::(serialized).unwrap(); + + assert_eq!(wrapper_from_bytes.0, wrapper_from_json.0); } } -/// Save timeline metadata to file -#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))] -pub async fn save_metadata( - conf: &'static PageServerConf, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - data: &TimelineMetadata, -) -> anyhow::Result<()> { - let path = conf.metadata_path(tenant_shard_id, timeline_id); - let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX); - let metadata_bytes = data.to_bytes().context("serialize metadata")?; - VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes) - .await - .context("write metadata")?; - Ok(()) +/// Parts of the metadata which are regularly modified. +pub(crate) struct MetadataUpdate { + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, } -#[derive(Error, Debug)] -pub enum LoadMetadataError { - #[error(transparent)] - Read(#[from] io::Error), - - #[error(transparent)] - Decode(#[from] anyhow::Error), -} - -pub fn load_metadata( - conf: &'static PageServerConf, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, -) -> Result { - let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id); - let metadata_bytes = std::fs::read(metadata_path)?; - - Ok(TimelineMetadata::from_bytes(&metadata_bytes)?) 
+impl MetadataUpdate { + pub(crate) fn new( + disk_consistent_lsn: Lsn, + prev_record_lsn: Option, + latest_gc_cutoff_lsn: Lsn, + ) -> Self { + Self { + disk_consistent_lsn, + prev_record_lsn, + latest_gc_cutoff_lsn, + } + } } #[cfg(test)] @@ -404,59 +548,6 @@ mod tests { ); } - #[test] - fn test_metadata_bincode_serde() { - let original_metadata = TimelineMetadata::new( - Lsn(0x200), - Some(Lsn(0x100)), - Some(TIMELINE_ID), - Lsn(0), - Lsn(0), - Lsn(0), - // Any version will do here, so use the default - crate::DEFAULT_PG_VERSION, - ); - let metadata_bytes = original_metadata - .to_bytes() - .expect("Cannot create bytes array from metadata"); - - let metadata_bincode_be_bytes = original_metadata - .ser() - .expect("Cannot serialize the metadata"); - - // 8 bytes for the length of the vector - assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len()); - - let expected_bincode_bytes = { - let mut temp = vec![]; - let len_bytes = metadata_bytes.len().to_be_bytes(); - temp.extend_from_slice(&len_bytes); - temp.extend_from_slice(&metadata_bytes); - temp - }; - assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes); - - let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap(); - // Deserialized metadata has the metadata header, which is different from the serialized one. 
- // Reference: TimelineMetaData::to_bytes() - let expected_metadata = { - let mut temp_metadata = original_metadata; - let body_bytes = temp_metadata - .body - .ser() - .expect("Cannot serialize the metadata body"); - let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); - let hdr = TimelineMetadataHeader { - size: metadata_size as u16, - format_version: METADATA_FORMAT_VERSION, - checksum: crc32c::crc32c(&body_bytes), - }; - temp_metadata.hdr = hdr; - temp_metadata - }; - assert_eq!(deserialized_metadata, expected_metadata); - } - #[test] fn test_metadata_bincode_serde_ensure_roundtrip() { let original_metadata = TimelineMetadata::new( @@ -470,8 +561,6 @@ mod tests { crate::DEFAULT_PG_VERSION, ); let expected_bytes = vec![ - /* bincode length encoding bytes */ - 0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector /* TimelineMetadataHeader */ 4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) /* TimelineMetadataBodyV2 */ @@ -501,7 +590,7 @@ mod tests { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; - let metadata_ser_bytes = original_metadata.ser().unwrap(); + let metadata_ser_bytes = original_metadata.to_bytes().unwrap(); assert_eq!(metadata_ser_bytes, expected_bytes); let expected_metadata = { @@ -519,7 +608,7 @@ mod tests { temp_metadata.hdr = hdr; temp_metadata }; - let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap(); + let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap(); assert_eq!(des_metadata, expected_metadata); } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 70b41b7b1f..4520bb9295 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2,16 +2,24 @@ //! page server. 
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use hyper::StatusCode; +use itertools::Itertools; use pageserver_api::key::Key; -use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId}; +use pageserver_api::models::LocationConfigMode; +use pageserver_api::shard::{ + ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, +}; +use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; use std::borrow::Cow; +use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap}; use std::ops::Deref; use std::sync::Arc; -use std::time::{Duration, Instant}; +use std::time::Duration; +use sysinfo::SystemExt; use tokio::fs; -use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; use anyhow::Context; use once_cell::sync::Lazy; @@ -20,7 +28,7 @@ use tokio_util::sync::CancellationToken; use tracing::*; use remote_storage::GenericRemoteStorage; -use utils::crashsafe; +use utils::{completion, crashsafe}; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -28,14 +36,17 @@ use crate::control_plane_client::{ ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError, }; use crate::deletion_queue::DeletionQueueClient; +use crate::http::routes::ACTIVE_TENANT_TIMEOUT; use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{ - AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt, + AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, }; use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; -use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState}; +use crate::tenant::storage_layer::inmemory_layer; +use crate::tenant::timeline::ShutdownMode; +use crate::tenant::{AttachedTenantConf, 
GcError, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -44,7 +55,9 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; +use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; +use super::timeline::detach_ancestor::PreparedTimelineDetach; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -56,6 +69,7 @@ use super::TenantSharedResources; /// that way we avoid having to carefully switch a tenant's ingestion etc on and off during /// its lifetime, and we can preserve some important safety invariants like `Tenant` always /// having a properly acquired generation (Secondary doesn't need a generation) +#[derive(Clone)] pub(crate) enum TenantSlot { Attached(Arc), Secondary(Arc), @@ -93,7 +107,7 @@ pub(crate) enum TenantsMap { /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. /// New tenants can be added using [`tenant_map_acquire_slot`]. Open(BTreeMap), - /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// The pageserver has entered shutdown mode via [`TenantManager::shutdown`]. /// Existing tenants are still accessible, but no new tenants can be created. ShuttingDown(BTreeMap), } @@ -106,6 +120,7 @@ pub(crate) enum TenantsMapRemoveResult { /// When resolving a TenantId to a shard, we may be looking for the 0th /// shard, or we might be looking for whichever shard holds a particular page. +#[derive(Copy, Clone)] pub(crate) enum ShardSelector { /// Only return the 0th shard, if it is present. If a non-0th shard is present, /// ignore it. 
@@ -114,6 +129,56 @@ pub(crate) enum ShardSelector { First, /// Pick the shard that holds this key Page(Key), + /// The shard ID is known: pick the given shard + Known(ShardIndex), +} + +/// A convenience for use with the re_attach ControlPlaneClient function: rather +/// than the serializable struct, we build this enum that encapsulates +/// the invariant that attached tenants always have generations. +/// +/// This represents the subset of a LocationConfig that we receive during re-attach. +pub(crate) enum TenantStartupMode { + Attached((AttachmentMode, Generation)), + Secondary, +} + +impl TenantStartupMode { + /// Return the generation & mode that should be used when starting + /// this tenant. + /// + /// If this returns None, the re-attach struct is in an invalid state and + /// should be ignored in the response. + fn from_reattach_tenant(rart: ReAttachResponseTenant) -> Option { + match (rart.mode, rart.gen) { + (LocationConfigMode::Detached, _) => None, + (LocationConfigMode::Secondary, _) => Some(Self::Secondary), + (LocationConfigMode::AttachedMulti, Some(g)) => { + Some(Self::Attached((AttachmentMode::Multi, Generation::new(g)))) + } + (LocationConfigMode::AttachedSingle, Some(g)) => { + Some(Self::Attached((AttachmentMode::Single, Generation::new(g)))) + } + (LocationConfigMode::AttachedStale, Some(g)) => { + Some(Self::Attached((AttachmentMode::Stale, Generation::new(g)))) + } + _ => { + tracing::warn!( + "Received invalid re-attach state for tenant {}: {rart:?}", + rart.id + ); + None + } + } + } +} + +/// Result type for looking up a TenantId to a specific shard +pub(crate) enum ShardResolveResult { + NotFound, + Found(Arc), + // Wait for this barrrier, then query again + InProgress(utils::completion::Barrier), } impl TenantsMap { @@ -129,51 +194,6 @@ impl TenantsMap { } } - /// A page service client sends a TenantId, and to look up the correct Tenant we must - /// resolve this to a fully qualified TenantShardId. 
- fn resolve_attached_shard( - &self, - tenant_id: &TenantId, - selector: ShardSelector, - ) -> Option { - let mut want_shard = None; - match self { - TenantsMap::Initializing => None, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { - // Ignore all slots that don't contain an attached tenant - let tenant = match &slot.1 { - TenantSlot::Attached(t) => t, - _ => continue, - }; - - match selector { - ShardSelector::First => return Some(*slot.0), - ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { - return Some(*slot.0) - } - ShardSelector::Page(key) => { - // First slot we see for this tenant, calculate the expected shard number - // for the key: we will use this for checking if this and subsequent - // slots contain the key, rather than recalculating the hash each time. - if want_shard.is_none() { - want_shard = Some(tenant.shard_identity.get_shard_number(&key)); - } - - if Some(tenant.shard_identity.number) == want_shard { - return Some(*slot.0); - } - } - _ => continue, - } - } - - // Fall through: we didn't find an acceptable shard - None - } - } - } - /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map. /// /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded @@ -194,6 +214,7 @@ impl TenantsMap { } } + #[cfg(all(debug_assertions, not(test)))] pub(crate) fn len(&self) -> usize { match self { TenantsMap::Initializing => 0, @@ -202,17 +223,15 @@ impl TenantsMap { } } +/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then +/// the slower actual deletion in the background. +/// /// This is "safe" in that that it won't leave behind a partially deleted directory /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting /// the contents. 
/// /// This is pageserver-specific, as it relies on future processes after a crash to check /// for TEMP_FILE_SUFFIX when loading things. -async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { - let tmp_path = safe_rename_tenant_dir(path).await?; - fs::remove_dir_all(tmp_path).await -} - async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { let parent = path .as_ref() @@ -235,6 +254,28 @@ async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result> = Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing)); @@ -252,11 +293,17 @@ pub struct TenantManager { // See https://github.com/neondatabase/neon/issues/5796 tenants: &'static std::sync::RwLock, resources: TenantSharedResources, + + // Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token. + // This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime), + // tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or + // when the tenant detaches. + cancel: CancellationToken, } fn emergency_generations( tenant_confs: &HashMap>, -) -> HashMap { +) -> HashMap { tenant_confs .iter() .filter_map(|(tid, lc)| { @@ -264,12 +311,15 @@ fn emergency_generations( Ok(lc) => lc, Err(_) => return None, }; - let gen = match &lc.mode { - LocationMode::Attached(alc) => Some(alc.generation), - LocationMode::Secondary(_) => None, - }; - - gen.map(|g| (*tid, g)) + Some(( + *tid, + match &lc.mode { + LocationMode::Attached(alc) => { + TenantStartupMode::Attached((alc.attach_mode, alc.generation)) + } + LocationMode::Secondary(_) => TenantStartupMode::Secondary, + }, + )) }) .collect() } @@ -279,7 +329,7 @@ async fn init_load_generations( tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, -) -> anyhow::Result>> { +) -> anyhow::Result>> { let generations = if conf.control_plane_emergency_mode { error!( "Emergency mode! 
Tenants will be attached unsafely using their last known generation" @@ -288,8 +338,13 @@ async fn init_load_generations( } else if let Some(client) = ControlPlaneClient::new(conf, cancel) { info!("Calling control plane API to re-attach tenants"); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. - match client.re_attach().await { - Ok(tenants) => tenants, + match client.re_attach(conf).await { + Ok(tenants) => tenants + .into_iter() + .flat_map(|(id, rart)| { + TenantStartupMode::from_reattach_tenant(rart).map(|tsm| (id, tsm)) + }) + .collect(), Err(RetryForeverError::ShuttingDown) => { anyhow::bail!("Shut down while waiting for control plane re-attach response") } @@ -303,14 +358,17 @@ async fn init_load_generations( // deletion list entries may still be valid. We provide that by pushing a recovery operation into // the queue. Sequential processing of te queue ensures that recovery is done before any new tenant deletions // are processed, even though we don't block on recovery completing here. - // - // Must only do this if remote storage is enabled, otherwise deletion queue - // is not running and channel push will fail. 
- if resources.remote_storage.is_some() { - resources - .deletion_queue_client - .recover(generations.clone())?; - } + let attached_tenants = generations + .iter() + .flat_map(|(id, start_mode)| { + match start_mode { + TenantStartupMode::Attached((_mode, generation)) => Some(generation), + TenantStartupMode::Secondary => None, + } + .map(|gen| (*id, *gen)) + }) + .collect(); + resources.deletion_queue_client.recover(attached_tenants)?; Ok(Some(generations)) } @@ -352,12 +410,6 @@ fn load_tenant_config( return Ok(None); } - let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); - if tenant_ignore_mark_file.exists() { - info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); - return Ok(None); - } - let tenant_shard_id = match tenant_dir_path .file_name() .unwrap_or_default() @@ -370,6 +422,12 @@ fn load_tenant_config( } }; + let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME); + if tenant_ignore_mark_file.exists() { + info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant"); + return Ok(None); + } + Ok(Some(( tenant_shard_id, Tenant::load_tenant_config(conf, &tenant_shard_id), @@ -425,12 +483,23 @@ pub async fn init_tenant_mgr( let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); + // Initialize dynamic limits that depend on system resources + let system_memory = + sysinfo::System::new_with_specifics(sysinfo::RefreshKind::new().with_memory()) + .total_memory(); + let max_ephemeral_layer_bytes = + conf.ephemeral_bytes_per_memory_kb as u64 * (system_memory / 1024); + tracing::info!("Initialized ephemeral layer size limit to {max_ephemeral_layer_bytes}, for {system_memory} bytes of memory"); + inmemory_layer::GLOBAL_RESOURCES.max_dirty_bytes.store( + max_ephemeral_layer_bytes, + std::sync::atomic::Ordering::Relaxed, + ); + // Scan local filesystem for attached tenants let tenant_configs = init_load_tenant_configs(conf).await?; - // 
Determine which tenants are to be attached - let tenant_generations = - init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; + // Determine which tenants are to be secondary or attached, and in which generation + let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; tracing::info!( "Attaching {} tenants at startup, warming up {} at a time", @@ -439,7 +508,11 @@ pub async fn init_tenant_mgr( ); TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64); - // Construct `Tenant` objects and start them running + // Accumulate futures for writing tenant configs, so that we can execute in parallel + let mut config_write_futs = Vec::new(); + + // Update the location configs according to the re-attach response and persist them to disk + tracing::info!("Updating {} location configs", tenant_configs.len()); for (tenant_shard_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_shard_id); @@ -453,6 +526,7 @@ pub async fn init_tenant_mgr( TenantSlot::Attached(Tenant::create_broken_tenant( conf, tenant_shard_id, + resources.remote_storage.clone(), format!("{}", e), )), ); @@ -460,95 +534,159 @@ pub async fn init_tenant_mgr( } }; - let generation = if let Some(generations) = &tenant_generations { + // FIXME: if we were attached, and get demoted to secondary on re-attach, we + // don't have a place to get a config. + // (https://github.com/neondatabase/neon/issues/5377) + const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig = + SecondaryLocationConfig { warm: true }; + + if let Some(tenant_modes) = &tenant_modes { // We have a generation map: treat it as the authority for whether // this tenant is really attached. 
- if let Some(gen) = generations.get(&tenant_shard_id) { - *gen - } else { - match &location_conf.mode { - LocationMode::Secondary(secondary_config) => { - // We do not require the control plane's permission for secondary mode - // tenants, because they do no remote writes and hence require no - // generation number - info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); - tenants.insert( - tenant_shard_id, - TenantSlot::Secondary(SecondaryTenant::new( - tenant_shard_id, - secondary_config, - )), - ); - } - LocationMode::Attached(_) => { - // TODO: augment re-attach API to enable the control plane to - // instruct us about secondary attachments. That way, instead of throwing - // away local state, we can gracefully fall back to secondary here, if the control - // plane tells us so. - // (https://github.com/neondatabase/neon/issues/5377) - info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}", - ); - } - } - }; + match tenant_modes.get(&tenant_shard_id) { + None => { + info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response"); - continue; + match safe_rename_tenant_dir(&tenant_dir_path).await { + Ok(tmp_path) => { + spawn_background_purge(tmp_path); + } + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}"); + } + }; + + // We deleted local content: move on to next tenant, don't try and spawn this one. 
+ continue; + } + Some(TenantStartupMode::Secondary) => { + if !matches!(location_conf.mode, LocationMode::Secondary(_)) { + location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); + } + } + Some(TenantStartupMode::Attached((attach_mode, generation))) => { + let old_gen_higher = match &location_conf.mode { + LocationMode::Attached(AttachedLocationConfig { + generation: old_generation, + attach_mode: _attach_mode, + }) => { + if old_generation > generation { + Some(old_generation) + } else { + None + } + } + _ => None, + }; + if let Some(old_generation) = old_gen_higher { + tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Control plane gave decreasing generation ({generation:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary", + old_generation + ); + + // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away + // local disk content: demote to secondary rather than detaching. + location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); + } else { + location_conf.attach_in_generation(*attach_mode, *generation); + } + } } } else { // Legacy mode: no generation information, any tenant present // on local disk may activate info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",); - Generation::none() }; // Presence of a generation number implies attachment: attach the tenant // if it wasn't already, and apply the generation number. 
- location_conf.attach_in_generation(generation); - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; + config_write_futs.push(async move { + let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await; + (tenant_shard_id, location_conf, r) + }); + } + // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency + tracing::info!( + "Writing {} location config files...", + config_write_futs.len() + ); + let config_write_results = futures::stream::iter(config_write_futs) + .buffer_unordered(16) + .collect::>() + .await; + + tracing::info!( + "Spawning {} tenant shard locations...", + config_write_results.len() + ); + // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running + for (tenant_shard_id, location_conf, config_write_result) in config_write_results { + // Errors writing configs are fatal + config_write_result?; + + let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; - match tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - Some(init_order.clone()), - &TENANTS, - SpawnMode::Normal, - &ctx, - ) { - Ok(tenant) => { - tenants.insert(tenant_shard_id, TenantSlot::Attached(tenant)); + let slot = match location_conf.mode { + LocationMode::Attached(attached_conf) => { + match tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + &TENANTS, + SpawnMode::Lazy, + &ctx, + ) { + Ok(tenant) => TenantSlot::Attached(tenant), + Err(e) => { + error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); + continue; + } + } } - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, 
shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); + LocationMode::Secondary(secondary_conf) => { + info!( + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + "Starting secondary tenant" + ); + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + location_conf.tenant_conf, + &secondary_conf, + )) } - } + }; + + METRICS.slot_inserted(&slot); + tenants.insert(tenant_shard_id, slot); } info!("Processed {} local tenants at startup", tenants.len()); let mut tenants_map = TENANTS.write().unwrap(); assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); - METRICS.tenant_slots.set(tenants.len() as u64); + *tenants_map = TenantsMap::Open(tenants); Ok(TenantManager { conf, tenants: &TENANTS, resources, + cancel: CancellationToken::new(), }) } /// Wrapper for Tenant::spawn that checks invariants before running, and inserts /// a broken tenant in the map if Tenant::spawn fails. #[allow(clippy::too_many_arguments)] -pub(crate) fn tenant_spawn( +fn tenant_spawn( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, tenant_path: &Utf8Path, @@ -581,13 +719,7 @@ pub(crate) fn tenant_spawn( "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}" ); - info!( - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug(), - generation = ?location_conf.location.generation, - attach_mode = ?location_conf.location.attach_mode, - "Attaching tenant" - ); + let remote_storage = resources.remote_storage.clone(); let tenant = match Tenant::spawn( conf, tenant_shard_id, @@ -602,33 +734,24 @@ pub(crate) fn tenant_spawn( Ok(tenant) => tenant, Err(e) => { error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}"); - Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}")) + Tenant::create_broken_tenant(conf, tenant_shard_id, remote_storage, format!("{e:#}")) } }; Ok(tenant) } -/// -/// Shut down all tenants. 
This runs as part of pageserver shutdown. -/// -/// NB: We leave the tenants in the map, so that they remain accessible through -/// the management API until we shut it down. If we removed the shut-down tenants -/// from the tenants map, the management API would return 404 for these tenants, -/// because TenantsMap::get() now returns `None`. -/// That could be easily misinterpreted by control plane, the consumer of the -/// management API. For example, it could attach the tenant on a different pageserver. -/// We would then be in split-brain once this pageserver restarts. -#[instrument(skip_all)] -pub(crate) async fn shutdown_all_tenants() { - shutdown_all_tenants0(&TENANTS).await -} - async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { - use utils::completion; - let mut join_set = JoinSet::new(); + #[cfg(all(debug_assertions, not(test)))] + { + // Check that our metrics properly tracked the size of the tenants map. This is a convenient location to check, + // as it happens implicitly at the end of tests etc. + let m = tenants.read().unwrap(); + debug_assert_eq!(METRICS.slots_total(), m.len() as u64); + } + // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants. 
let (total_in_progress, total_attached) = { let mut m = tenants.write().unwrap(); @@ -649,11 +772,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone())); join_set.spawn( async move { - let freeze_and_flush = true; - let res = { let (_guard, shutdown_progress) = completion::channel(); - t.shutdown(shutdown_progress, freeze_and_flush).await + t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await }; if let Err(other_progress) = res { @@ -665,7 +786,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { // going to log too many lines debug!("tenant successfully stopped"); } - .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())), + .instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())), ); total_attached += 1; @@ -717,7 +838,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { tokio::select! 
{ Some(joined) = join_set.join_next() => { match joined { - Ok(()) => {} + Ok(()) => {}, Err(join_error) if join_error.is_cancelled() => { unreachable!("we are not cancelling any of the tasks"); } @@ -754,74 +875,22 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { // caller will log how long we took } -pub(crate) async fn create_tenant( - conf: &'static PageServerConf, - tenant_conf: TenantConfOpt, - tenant_shard_id: TenantShardId, - generation: Generation, - resources: TenantSharedResources, - ctx: &RequestContext, -) -> Result, TenantMapInsertError> { - let location_conf = LocationConf::attached_single(tenant_conf, generation); - info!("Creating tenant at location {location_conf:?}"); +#[derive(thiserror::Error, Debug)] +pub(crate) enum UpsertLocationError { + #[error("Bad config request: {0}")] + BadRequest(anyhow::Error), - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?; + #[error("Cannot change config in this state: {0}")] + Unavailable(#[from] TenantMapError), - let shard_identity = location_conf.shard; - let created_tenant = tenant_spawn( - conf, - tenant_shard_id, - &tenant_path, - resources, - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - None, - &TENANTS, - SpawnMode::Create, - ctx, - )?; - // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
- // See https://github.com/neondatabase/neon/issues/4233 + #[error("Tenant is already being modified")] + InProgress, - let created_tenant_id = created_tenant.tenant_id(); - debug_assert_eq!(created_tenant_id, tenant_shard_id.tenant_id); + #[error("Failed to flush: {0}")] + Flush(anyhow::Error), - slot_guard.upsert(TenantSlot::Attached(created_tenant.clone()))?; - - Ok(created_tenant) -} - -#[derive(Debug, thiserror::Error)] -pub(crate) enum SetNewTenantConfigError { - #[error(transparent)] - GetTenant(#[from] GetTenantError), - #[error(transparent)] - Persist(anyhow::Error), -} - -pub(crate) async fn set_new_tenant_config( - conf: &'static PageServerConf, - new_tenant_conf: TenantConfOpt, - tenant_id: TenantId, -) -> Result<(), SetNewTenantConfigError> { - // Legacy API: does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_shard_id, true)?; - - // This is a legacy API that only operates on attached tenants: the preferred - // API to use is the location_config/ endpoint, which lets the caller provide - // the full LocationConf. - let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation); - - Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) - .await - .map_err(SetNewTenantConfigError::Persist)?; - tenant.set_new_tenant_config(new_tenant_conf); - Ok(()) + #[error("Internal error: {0}")] + Other(#[from] anyhow::Error), } impl TenantManager { @@ -831,35 +900,22 @@ impl TenantManager { self.conf } - /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query. - /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. + /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently + /// undergoing a state change (i.e. 
slot is InProgress). + /// + /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or + /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it. pub(crate) fn get_attached_tenant_shard( &self, tenant_shard_id: TenantShardId, - active_only: bool, ) -> Result, GetTenantError> { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; match peek_slot { - Some(TenantSlot::Attached(tenant)) => match tenant.current_state() { - TenantState::Broken { - reason, - backtrace: _, - } if active_only => Err(GetTenantError::Broken(reason)), - TenantState::Active => Ok(Arc::clone(tenant)), - _ => { - if active_only { - Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) - } else { - Ok(Arc::clone(tenant)) - } - } - }, - Some(TenantSlot::InProgress(_)) => { - Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) - } + Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)), + Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) } @@ -882,14 +938,26 @@ impl TenantManager { } } + /// Whether the `TenantManager` is responsible for the tenant shard + pub(crate) fn manages_tenant_shard(&self, tenant_shard_id: TenantShardId) -> bool { + let locked = self.tenants.read().unwrap(); + + let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) + .ok() + .flatten(); + + peek_slot.is_some() + } + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, tenant_shard_id: TenantShardId, new_location_config: LocationConf, flush: Option, + mut spawn_mode: SpawnMode, ctx: &RequestContext, - ) -> Result<(), anyhow::Error> { + ) -> Result>, UpsertLocationError> { 
debug_assert_current_span_has_tenant_id(); info!("configuring tenant location to state {new_location_config:?}"); @@ -907,18 +975,29 @@ impl TenantManager { tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?; match (&new_location_config.mode, peek_slot) { (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => { - if attach_conf.generation == tenant.generation { - // A transition from Attached to Attached in the same generation, we may - // take our fast path and just provide the updated configuration - // to the tenant. - tenant.set_new_location_config(AttachedTenantConf::try_from( - new_location_config.clone(), - )?); + match attach_conf.generation.cmp(&tenant.generation) { + Ordering::Equal => { + // A transition from Attached to Attached in the same generation, we may + // take our fast path and just provide the updated configuration + // to the tenant. + tenant.set_new_location_config( + AttachedTenantConf::try_from(new_location_config.clone()) + .map_err(UpsertLocationError::BadRequest)?, + ); - Some(FastPathModified::Attached(tenant.clone())) - } else { - // Different generations, fall through to general case - None + Some(FastPathModified::Attached(tenant.clone())) + } + Ordering::Less => { + return Err(UpsertLocationError::BadRequest(anyhow::anyhow!( + "Generation {:?} is less than existing {:?}", + attach_conf.generation, + tenant.generation + ))); + } + Ordering::Greater => { + // Generation advanced, fall through to general case of replacing `Tenant` object + None + } } } ( @@ -926,6 +1005,7 @@ impl TenantManager { Some(TenantSlot::Secondary(secondary_tenant)), ) => { secondary_tenant.set_config(secondary_conf); + secondary_tenant.set_tenant_conf(&new_location_config.tenant_conf); Some(FastPathModified::Secondary(secondary_tenant.clone())) } _ => { @@ -940,8 +1020,7 @@ impl TenantManager { match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, 
&tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; + .await?; // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -954,9 +1033,9 @@ impl TenantManager { if let Some(flush_timeout) = flush { match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { Ok(Err(e)) => { - return Err(e); + return Err(UpsertLocationError::Flush(e)); } - Ok(Ok(_)) => return Ok(()), + Ok(Ok(_)) => return Ok(Some(tenant)), Err(_) => { tracing::warn!( timeout_ms = flush_timeout.as_millis(), @@ -967,14 +1046,13 @@ impl TenantManager { } } - return Ok(()); + return Ok(Some(tenant)); } Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; + .await?; - return Ok(()); + return Ok(None); } None => { // Proceed with the general case procedure, where we will shutdown & remove any existing @@ -987,7 +1065,14 @@ impl TenantManager { // the tenant is inaccessible to the outside world while we are doing this, but that is sensible: // the state is ill-defined while we're in transition. Transitions are async, but fast: we do // not do significant I/O, and shutdowns should be prompt via cancellation tokens. 
- let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any) + .map_err(|e| match e { + TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => { + unreachable!("Called with mode Any") + } + TenantSlotError::InProgress => UpsertLocationError::InProgress, + TenantSlotError::MapState(s) => UpsertLocationError::Unavailable(s), + })?; match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { @@ -1009,7 +1094,7 @@ impl TenantManager { }; info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(barrier) => { info!("Shutdown already in progress, waiting for it to complete"); @@ -1017,6 +1102,12 @@ impl TenantManager { } } slot_guard.drop_old_value().expect("We just shut it down"); + + // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then + // the caller thinks they're creating but the tenant already existed. We must switch to + // Eager mode so that when starting this Tenant we properly probe remote storage for timelines, + // rather than assuming it to be empty. + spawn_mode = SpawnMode::Eager; } Some(TenantSlot::Secondary(state)) => { info!("Shutting down secondary tenant"); @@ -1025,7 +1116,9 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out // if the contents of a slot were InProgress. - anyhow::bail!("Acquired an InProgress slot, this is a bug.") + return Err(UpsertLocationError::Other(anyhow::anyhow!( + "Acquired an InProgress slot, this is a bug." + ))); } None => { // Slot was vacant, nothing needs shutting down. 
@@ -1047,26 +1140,44 @@ impl TenantManager { // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await - .map_err(SetNewTenantConfigError::Persist)?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?; let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { - TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config)) + let shard_identity = new_location_config.shard; + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + shard_identity, + new_location_config.tenant_conf, + secondary_config, + )) } LocationMode::Attached(_attach_config) => { let shard_identity = new_location_config.shard; + + // Testing hack: if we are configured with no control plane, then drop the generation + // from upserts. This enables creating generation-less tenants even though neon_local + // always uses generations when calling the location conf API. + let attached_conf = if cfg!(feature = "testing") { + let mut conf = AttachedTenantConf::try_from(new_location_config)?; + if self.conf.control_plane_api.is_none() { + conf.location.generation = Generation::none(); + } + conf + } else { + AttachedTenantConf::try_from(new_location_config)? 
+ }; + let tenant = tenant_spawn( self.conf, tenant_shard_id, &tenant_path, self.resources.clone(), - AttachedTenantConf::try_from(new_location_config)?, + attached_conf, shard_identity, None, self.tenants, - SpawnMode::Normal, + spawn_mode, ctx, )?; @@ -1074,9 +1185,52 @@ impl TenantManager { } }; - slot_guard.upsert(new_slot)?; + let attached_tenant = if let TenantSlot::Attached(tenant) = &new_slot { + Some(tenant.clone()) + } else { + None + }; - Ok(()) + match slot_guard.upsert(new_slot) { + Err(TenantSlotUpsertError::InternalError(e)) => { + Err(UpsertLocationError::Other(anyhow::anyhow!(e))) + } + Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), + Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { + // If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then + // we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants + // are shutdown. + // + // We must shut it down inline here. 
+ match new_slot { + TenantSlot::InProgress(_) => { + // Unreachable because we never insert an InProgress + unreachable!() + } + TenantSlot::Attached(tenant) => { + let (_guard, progress) = utils::completion::channel(); + info!("Shutting down just-spawned tenant, because tenant manager is shut down"); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + info!("Finished shutting down just-spawned tenant"); + } + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } + } + } + TenantSlot::Secondary(secondary_tenant) => { + secondary_tenant.shutdown().await; + } + } + + Err(UpsertLocationError::Unavailable( + TenantMapError::ShuttingDown, + )) + } + Ok(()) => Ok(attached_tenant), + } } /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same @@ -1093,7 +1247,7 @@ impl TenantManager { &self, tenant_shard_id: TenantShardId, drop_cache: bool, - ctx: RequestContext, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; let Some(old_slot) = slot_guard.get_old_value() else { @@ -1106,7 +1260,7 @@ impl TenantManager { }; let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { slot_guard.drop_old_value()?; } @@ -1145,8 +1299,8 @@ impl TenantManager { shard_identity, None, self.tenants, - SpawnMode::Normal, - &ctx, + SpawnMode::Eager, + ctx, )?; slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1192,11 +1346,33 @@ impl TenantManager { } } + /// Total list of all tenant slots: this includes attached, secondary, and InProgress. 
+ pub(crate) fn list(&self) -> Vec<(TenantShardId, TenantSlot)> { + let locked = self.tenants.read().unwrap(); + match &*locked { + TenantsMap::Initializing => Vec::new(), + TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => { + map.iter().map(|(k, v)| (*k, v.clone())).collect() + } + } + } + + pub(crate) fn get(&self, tenant_shard_id: TenantShardId) -> Option { + let locked = self.tenants.read().unwrap(); + match &*locked { + TenantsMap::Initializing => None, + TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => { + map.get(&tenant_shard_id).cloned() + } + } + } + pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, activation_timeout: Duration, - ) -> Result<(), DeleteTenantError> { + ) -> Result { + super::span::debug_assert_current_span_has_tenant_id(); // We acquire a SlotGuard during this function to protect against concurrent // changes while the ::prepare phase of DeleteTenantFlow executes, but then // have to return the Tenant to the map while the background deletion runs. @@ -1208,30 +1384,98 @@ impl TenantManager { // // See https://github.com/neondatabase/neon/issues/5080 - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?; + // Tenant deletion can happen two ways: + // - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping + // state until deletion is complete. + // - New: called on a pageserver without an attached location. We proceed with deletion from + // remote storage. + // + // See https://github.com/neondatabase/neon/issues/5080 for more context on this transition. 
- // unwrap is safe because we used MustExist mode when acquiring - let tenant = match slot_guard.get_old_value().as_ref().unwrap() { - TenantSlot::Attached(tenant) => tenant.clone(), - _ => { - // Express "not attached" as equivalent to "not found" - return Err(DeleteTenantError::NotAttached); + let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + match &slot_guard.old_value { + Some(TenantSlot::Attached(tenant)) => { + // Legacy deletion flow: the tenant remains attached, goes to Stopping state, and + // deletion will be resumed across restarts. + let tenant = tenant.clone(); + return self + .delete_tenant_attached(slot_guard, tenant, activation_timeout) + .await; } + Some(TenantSlot::Secondary(secondary_tenant)) => { + secondary_tenant.shutdown().await; + let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); + let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + })?; + spawn_background_purge(tmp_dir); + } + Some(TenantSlot::InProgress(_)) => unreachable!(), + None => {} }; + // Fall through: local state for this tenant is no longer present, proceed with remote delete + let remote_path = remote_tenant_path(&tenant_shard_id); + let keys = match self + .resources + .remote_storage + .list( + Some(&remote_path), + remote_storage::ListingMode::NoDelimiter, + None, + &self.cancel, + ) + .await + { + Ok(listing) => listing.keys, + Err(remote_storage::DownloadError::Cancelled) => { + return Err(DeleteTenantError::Cancelled) + } + Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND), + Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), + }; + + if keys.is_empty() { + tracing::info!("Remote storage already deleted"); + } else { + tracing::info!("Deleting {} keys from remote storage", keys.len()); + self.resources + .remote_storage + .delete_objects(&keys, 
&self.cancel) + .await?; + } + + // Callers use 404 as success for deletions, for historical reasons. + Ok(StatusCode::NOT_FOUND) + } + + async fn delete_tenant_attached( + &self, + slot_guard: SlotGuard, + tenant: Arc, + activation_timeout: Duration, + ) -> Result { match tenant.current_state() { TenantState::Broken { .. } | TenantState::Stopping { .. } => { - // If a tenant is broken or stopping, DeleteTenantFlow can - // handle it: broken tenants proceed to delete, stopping tenants - // are checked for deletion already in progress. + // If deletion is already in progress, return success (the semantics of this + // function are to return success after deletion is spawned in background). + // Otherwise fall through and let [`DeleteTenantFlow`] handle this state. + if DeleteTenantFlow::is_in_progress(&tenant) { + // The `delete_progress` lock is held: deletion is already happening + // in the background + slot_guard.revert(); + return Ok(StatusCode::ACCEPTED); + } } _ => { tenant .wait_to_become_active(activation_timeout) .await .map_err(|e| match e { - GetActiveTenantError::WillNotBecomeActive(_) => { + GetActiveTenantError::WillNotBecomeActive(_) + | GetActiveTenantError::Broken(_) => { DeleteTenantError::InvalidState(tenant.current_state()) } GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled, @@ -1249,68 +1493,683 @@ impl TenantManager { self.resources.remote_storage.clone(), &TENANTS, tenant, + &self.cancel, ) .await; // The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow slot_guard.revert(); - result + let () = result?; + Ok(StatusCode::ACCEPTED) + } + + #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] + pub(crate) async fn shard_split( + &self, + tenant: Arc, + new_shard_count: ShardCount, + new_stripe_size: Option, + ctx: &RequestContext, + ) -> anyhow::Result> { 

+ let tenant_shard_id = *tenant.get_tenant_shard_id(); + let r = self + .do_shard_split(tenant, new_shard_count, new_stripe_size, ctx) + .await; + if r.is_err() { + // Shard splitting might have left the original shard in a partially shut down state (it + // stops the shard's remote timeline client). Reset it to ensure we leave things in + // a working state. + if self.get(tenant_shard_id).is_some() { + tracing::warn!("Resetting after shard split failure"); + if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await { + // Log this error because our return value will still be the original error, not this one. This is + // a severe error: if this happens, we might be leaving behind a tenant that is not fully functional + // (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or + // setting it broken probably won't help either. + tracing::error!("Failed to reset: {e}"); + } + } + } + + r + } + + pub(crate) async fn do_shard_split( + &self, + tenant: Arc, + new_shard_count: ShardCount, + new_stripe_size: Option, + ctx: &RequestContext, + ) -> anyhow::Result> { + let tenant_shard_id = *tenant.get_tenant_shard_id(); + + // Validate the incoming request + if new_shard_count.count() <= tenant_shard_id.shard_count.count() { + anyhow::bail!("Requested shard count is not an increase"); + } + let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count(); + if !expansion_factor.is_power_of_two() { + anyhow::bail!("Requested split is not a power of two"); + } + + if let Some(new_stripe_size) = new_stripe_size { + if tenant.get_shard_stripe_size() != new_stripe_size + && tenant_shard_id.shard_count.count() > 1 + { + // This tenant already has multiple shards, it is illegal to try and change its stripe size + anyhow::bail!( + "Shard stripe size may not be modified once tenant has multiple shards" + ); + } + } + + // Plan: identify what the new child shards will be + let child_shards = 
tenant_shard_id.split(new_shard_count); + tracing::info!( + "Shard {} splits into: {}", + tenant_shard_id.to_index(), + child_shards + .iter() + .map(|id| format!("{}", id.to_index())) + .join(",") + ); + + fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + let parent_shard_identity = tenant.shard_identity; + let parent_tenant_conf = tenant.get_tenant_conf(); + let parent_generation = tenant.generation; + + // Phase 1: Write out child shards' remote index files, in the parent tenant's current generation + if let Err(e) = tenant.split_prepare(&child_shards).await { + // If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might + // have been left in a partially-shut-down state. + tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning"); + return Err(e); + } + + fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + self.resources.deletion_queue_client.flush_advisory(); + + // Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant + drop(tenant); + let mut parent_slot_guard = + tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let parent = match parent_slot_guard.get_old_value() { + Some(TenantSlot::Attached(t)) => t, + Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"), + Some(TenantSlot::InProgress(_)) => { + // tenant_map_acquire_slot never returns InProgress, if a slot was InProgress + // it would return an error. + unreachable!() + } + None => { + // We don't actually need the parent shard to still be attached to do our work, but it's + // a weird enough situation that the caller probably didn't want us to continue working + // if they had detached the tenant they requested the split on. 
+ anyhow::bail!("Detached parent shard in the middle of split!") + } + }; + fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + // Optimization: hardlink layers from the parent into the children, so that they don't have to + // re-download & duplicate the data referenced in their initial IndexPart + self.shard_split_hardlink(parent, child_shards.clone()) + .await?; + fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + // Take a snapshot of where the parent's WAL ingest had got to: we will wait for + // child shards to reach this point. + let mut target_lsns = HashMap::new(); + for timeline in parent.timelines.lock().unwrap().clone().values() { + target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn()); + } + + // TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources + // and could slow down the children trying to catch up. + + // Phase 3: Spawn the child shards + for child_shard in &child_shards { + let mut child_shard_identity = parent_shard_identity; + if let Some(new_stripe_size) = new_stripe_size { + child_shard_identity.stripe_size = new_stripe_size; + } + child_shard_identity.count = child_shard.shard_count; + child_shard_identity.number = child_shard.shard_number; + + let child_location_conf = LocationConf { + mode: LocationMode::Attached(AttachedLocationConfig { + generation: parent_generation, + attach_mode: AttachmentMode::Single, + }), + shard: child_shard_identity, + tenant_conf: parent_tenant_conf.clone(), + }; + + self.upsert_location( + *child_shard, + child_location_conf, + None, + SpawnMode::Eager, + ctx, + ) + .await?; + } + + fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + // Phase 4: wait for child shards' WAL ingest to catch up to target LSN + for child_shard_id in &child_shards { + let child_shard_id = *child_shard_id; + let child_shard = { + let locked = 

TENANTS.read().unwrap(); + let peek_slot = + tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?; + peek_slot.and_then(|s| s.get_attached()).cloned() + }; + if let Some(t) = child_shard { + // Wait for the child shard to become active: this should be very quick because it only + // has to download the index_part that we just uploaded when creating it. + if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await { + // This is not fatal: we have durably created the child shard. It just makes the + // split operation less seamless for clients, as we may detach the parent + // shard before the child shards are fully ready to serve requests. + tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}"); + continue; + } + + let timelines = t.timelines.lock().unwrap().clone(); + for timeline in timelines.values() { + let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else { + continue; + }; + + tracing::info!( + "Waiting for child shard {}/{} to reach target lsn {}...", + child_shard_id, + timeline.timeline_id, + target_lsn + ); + + fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + if let Err(e) = timeline + .wait_lsn( + *target_lsn, + crate::tenant::timeline::WaitLsnWaiter::Tenant, + ctx, + ) + .await + { + // Failure here might mean shutdown, in any case this part is an optimization + // and we shouldn't hold up the split operation. 

+ tracing::warn!( + "Failed to wait for timeline {} to reach lsn {target_lsn}: {e}", + timeline.timeline_id + ); + } else { + tracing::info!( + "Child shard {}/{} reached target lsn {}", + child_shard_id, + timeline.timeline_id, + target_lsn + ); + } + } + } + } + + // Phase 5: Shut down the parent shard, and erase it from disk + let (_guard, progress) = completion::channel(); + match parent.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => {} + Err(other) => { + other.wait().await; + } + } + let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id); + let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; + spawn_background_purge(tmp_path); + + fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!( + "failpoint" + ))); + + parent_slot_guard.drop_old_value()?; + + // Phase 6: Release the InProgress on the parent shard + drop(parent_slot_guard); + + Ok(child_shards) + } + + /// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization + /// to avoid the children downloading them again. + /// + /// For each resident layer in the parent shard, we will hard link it into all of the child shards. 
+ async fn shard_split_hardlink( + &self, + parent_shard: &Tenant, + child_shards: Vec, + ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_id(); + + let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id()); + let (parent_timelines, parent_layers) = { + let mut parent_layers = Vec::new(); + let timelines = parent_shard.timelines.lock().unwrap().clone(); + let parent_timelines = timelines.keys().cloned().collect::>(); + for timeline in timelines.values() { + let timeline_layers = timeline + .layers + .read() + .await + .likely_resident_layers() + .collect::>(); + + for layer in timeline_layers { + let relative_path = layer + .local_path() + .strip_prefix(&parent_path) + .context("Removing prefix from parent layer path")?; + parent_layers.push(relative_path.to_owned()); + } + } + debug_assert!( + !parent_layers.is_empty(), + "shutdown cannot empty the layermap" + ); + (parent_timelines, parent_layers) + }; + + let mut child_prefixes = Vec::new(); + let mut create_dirs = Vec::new(); + + for child in child_shards { + let child_prefix = self.conf.tenant_path(&child); + create_dirs.push(child_prefix.clone()); + create_dirs.extend( + parent_timelines + .iter() + .map(|t| self.conf.timeline_path(&child, t)), + ); + + child_prefixes.push(child_prefix); + } + + // Since we will do a large number of small filesystem metadata operations, batch them into + // spawn_blocking calls rather than doing each one as a tokio::fs round-trip. 
+ let jh = tokio::task::spawn_blocking(move || -> anyhow::Result { + for dir in &create_dirs { + if let Err(e) = std::fs::create_dir_all(dir) { + // Ignore AlreadyExists errors, drop out on all other errors + match e.kind() { + std::io::ErrorKind::AlreadyExists => {} + _ => { + return Err(anyhow::anyhow!(e).context(format!("Creating {dir}"))); + } + } + } + } + + for child_prefix in child_prefixes { + for relative_layer in &parent_layers { + let parent_path = parent_path.join(relative_layer); + let child_path = child_prefix.join(relative_layer); + if let Err(e) = std::fs::hard_link(&parent_path, &child_path) { + match e.kind() { + std::io::ErrorKind::AlreadyExists => {} + std::io::ErrorKind::NotFound => { + tracing::info!( + "Layer {} not found during hard-linking, evicted during split?", + relative_layer + ); + } + _ => { + return Err(anyhow::anyhow!(e).context(format!( + "Hard linking {relative_layer} into {child_prefix}" + ))) + } + } + } + } + } + + // Durability is not required for correctness, but if we crashed during split and + // then restarted with empty timeline dirs, it would be very inefficient to + // re-populate from remote storage. + for dir in create_dirs { + if let Err(e) = crashsafe::fsync(&dir) { + // Something removed a newly created timeline dir out from underneath us? Extremely + // unexpected, but not worth panic'ing over as this whole function is just an + // optimization. + tracing::warn!("Failed to fsync directory {dir}: {e}") + } + } + + Ok(parent_layers.len()) + }); + + match jh.await { + Ok(Ok(layer_count)) => { + tracing::info!(count = layer_count, "Hard linked layers into child shards"); + } + Ok(Err(e)) => { + // This is an optimization, so we tolerate failure. + tracing::warn!("Error hard-linking layers, proceeding anyway: {e}") + } + Err(e) => { + // This is something totally unexpected like a panic, so bail out. 

+ anyhow::bail!("Error joining hard linking task: {e}"); + } + } + + Ok(()) + } + + /// + /// Shut down all tenants. This runs as part of pageserver shutdown. + /// + /// NB: We leave the tenants in the map, so that they remain accessible through + /// the management API until we shut it down. If we removed the shut-down tenants + /// from the tenants map, the management API would return 404 for these tenants, + /// because TenantsMap::get() now returns `None`. + /// That could be easily misinterpreted by control plane, the consumer of the + /// management API. For example, it could attach the tenant on a different pageserver. + /// We would then be in split-brain once this pageserver restarts. + #[instrument(skip_all)] + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + shutdown_all_tenants0(self.tenants).await + } + + pub(crate) async fn detach_tenant( + &self, + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + detach_ignored: bool, + deletion_queue_client: &DeletionQueueClient, + ) -> Result<(), TenantStateError> { + let tmp_path = self + .detach_tenant0( + conf, + &TENANTS, + tenant_shard_id, + detach_ignored, + deletion_queue_client, + ) + .await?; + spawn_background_purge(tmp_path); + + Ok(()) + } + + async fn detach_tenant0( + &self, + conf: &'static PageServerConf, + tenants: &std::sync::RwLock, + tenant_shard_id: TenantShardId, + detach_ignored: bool, + deletion_queue_client: &DeletionQueueClient, + ) -> Result { + let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { + let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); + safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| { + format!("local tenant directory {local_tenant_directory:?} rename") + }) + }; + + let removal_result = remove_tenant_from_memory( + tenants, + tenant_shard_id, + tenant_dir_rename_operation(tenant_shard_id), + ) + .await; + + // Flush pending deletions, so that they have a good 
chance of passing validation + // before this tenant is potentially re-attached elsewhere. + deletion_queue_client.flush_advisory(); + + // Ignored tenants are not present in memory and will bail the removal from memory operation. + // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. + if detach_ignored + && matches!( + removal_result, + Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) + ) + { + let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); + if tenant_ignore_mark.exists() { + info!("Detaching an ignored tenant"); + let tmp_path = tenant_dir_rename_operation(tenant_shard_id) + .await + .with_context(|| { + format!("Ignored tenant {tenant_shard_id} local directory rename") + })?; + return Ok(tmp_path); + } + } + + removal_result + } + + pub(crate) fn list_tenants( + &self, + ) -> Result, TenantMapListError> { + let tenants = TENANTS.read().unwrap(); + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + Ok(m.iter() + .filter_map(|(id, tenant)| match tenant { + TenantSlot::Attached(tenant) => { + Some((*id, tenant.current_state(), tenant.generation())) + } + TenantSlot::Secondary(_) => None, + TenantSlot::InProgress(_) => None, + }) + .collect()) + } + + /// Completes an earlier prepared timeline detach ancestor. 
+ pub(crate) async fn complete_detaching_timeline_ancestor( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + prepared: PreparedTimelineDetach, + ctx: &RequestContext, + ) -> Result, anyhow::Error> { + struct RevertOnDropSlot(Option); + + impl Drop for RevertOnDropSlot { + fn drop(&mut self) { + if let Some(taken) = self.0.take() { + taken.revert(); + } + } + } + + impl RevertOnDropSlot { + fn into_inner(mut self) -> SlotGuard { + self.0.take().unwrap() + } + } + + impl std::ops::Deref for RevertOnDropSlot { + type Target = SlotGuard; + + fn deref(&self) -> &Self::Target { + self.0.as_ref().unwrap() + } + } + + let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; + let slot_guard = RevertOnDropSlot(Some(slot_guard)); + + let tenant = { + let Some(old_slot) = slot_guard.get_old_value() else { + anyhow::bail!( + "Tenant not found when trying to complete detaching timeline ancestor" + ); + }; + + let Some(tenant) = old_slot.get_attached() else { + anyhow::bail!("Tenant is not in attached state"); + }; + + if !tenant.is_active() { + anyhow::bail!("Tenant is not active"); + } + + tenant.clone() + }; + + let timeline = tenant.get_timeline(timeline_id, true)?; + + let reparented = timeline + .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + .await?; + + let mut slot_guard = slot_guard.into_inner(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless shutdown was already going? 
+ anyhow::bail!("Cannot restart Tenant, already shutting down"); + } + } + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + self.tenants, + SpawnMode::Eager, + ctx, + )?; + + slot_guard.upsert(TenantSlot::Attached(tenant))?; + + Ok(reparented) + } + + /// A page service client sends a TenantId, and to look up the correct Tenant we must + /// resolve this to a fully qualified TenantShardId. + /// + /// During shard splits: we shall see parent shards in InProgress state and skip them, and + /// instead match on child shards which should appear in Attached state. Very early in a shard + /// split, or in other cases where a shard is InProgress, we will return our own InProgress result + /// to instruct the caller to wait for that to finish before querying again. + pub(crate) fn resolve_attached_shard( + &self, + tenant_id: &TenantId, + selector: ShardSelector, + ) -> ShardResolveResult { + let tenants = self.tenants.read().unwrap(); + let mut want_shard = None; + let mut any_in_progress = None; + + match &*tenants { + TenantsMap::Initializing => ShardResolveResult::NotFound, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { + for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + TenantSlot::InProgress(barrier) => { + // We might still find a usable shard, but in case we don't, remember that + // we saw at least one InProgress slot, so that we can distinguish this case + // from a simple NotFound in our return value. 
+ any_in_progress = Some(barrier.clone()); + continue; + } + _ => continue, + }; + + match selector { + ShardSelector::First => return ShardResolveResult::Found(tenant.clone()), + ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { + return ShardResolveResult::Found(tenant.clone()) + } + ShardSelector::Page(key) => { + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. + if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } + + if Some(tenant.shard_identity.number) == want_shard { + return ShardResolveResult::Found(tenant.clone()); + } + } + ShardSelector::Known(shard) + if tenant.shard_identity.shard_index() == shard => + { + return ShardResolveResult::Found(tenant.clone()); + } + _ => continue, + } + } + + // Fall through: we didn't find a slot that was in Attached state & matched our selector. If + // we found one or more InProgress slot, indicate to caller that they should retry later. Otherwise + // this requested shard simply isn't found. + if let Some(barrier) = any_in_progress { + ShardResolveResult::InProgress(barrier) + } else { + ShardResolveResult::NotFound + } + } + } } } #[derive(Debug, thiserror::Error)] pub(crate) enum GetTenantError { + /// NotFound is a TenantId rather than TenantShardId, because this error type is used from + /// getters that use a TenantId and a ShardSelector, not just getters that target a specific shard. 
#[error("Tenant {0} not found")] NotFound(TenantId), + #[error("Tenant {0} is not active")] - NotActive(TenantId), - /// Broken is logically a subset of NotActive, but a distinct error is useful as - /// NotActive is usually a retryable state for API purposes, whereas Broken - /// is a stuck error state - #[error("Tenant is broken: {0}")] - Broken(String), + NotActive(TenantShardId), // Initializing or shutting down: cannot authoritatively say whether we have this tenant #[error("Tenant map is not available: {0}")] MapState(#[from] TenantMapError), } -/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. -/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. -/// -/// This method is cancel-safe. -pub(crate) fn get_tenant( - tenant_shard_id: TenantShardId, - active_only: bool, -) -> Result, GetTenantError> { - let locked = TENANTS.read().unwrap(); - - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; - - match peek_slot { - Some(TenantSlot::Attached(tenant)) => match tenant.current_state() { - TenantState::Broken { - reason, - backtrace: _, - } if active_only => Err(GetTenantError::Broken(reason)), - TenantState::Active => Ok(Arc::clone(tenant)), - _ => { - if active_only { - Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) - } else { - Ok(Arc::clone(tenant)) - } - } - }, - Some(TenantSlot::InProgress(_)) => { - Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) - } - None | Some(TenantSlot::Secondary(_)) => { - Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) - } - } -} - #[derive(thiserror::Error, Debug)] pub(crate) enum GetActiveTenantError { /// We may time out either while TenantSlot is InProgress, or while the Tenant @@ -1334,105 +2193,12 @@ pub(crate) enum GetActiveTenantError { /// Tenant exists, but is in a state that cannot become active (e.g. 
Stopping, Broken) #[error("will not become active. Current state: {0}")] WillNotBecomeActive(TenantState), -} -/// Get a [`Tenant`] in its active state. If the tenant_id is currently in [`TenantSlot::InProgress`] -/// state, then wait for up to `timeout`. If the [`Tenant`] is not currently in [`TenantState::Active`], -/// then wait for up to `timeout` (minus however long we waited for the slot). -pub(crate) async fn get_active_tenant_with_timeout( - tenant_id: TenantId, - shard_selector: ShardSelector, - timeout: Duration, - cancel: &CancellationToken, -) -> Result, GetActiveTenantError> { - enum WaitFor { - Barrier(utils::completion::Barrier), - Tenant(Arc), - } - - let wait_start = Instant::now(); - let deadline = wait_start + timeout; - - let (wait_for, tenant_shard_id) = { - let locked = TENANTS.read().unwrap(); - - // Resolve TenantId to TenantShardId - let tenant_shard_id = locked - .resolve_attached_shard(&tenant_id, shard_selector) - .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - )))?; - - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => { - match tenant.current_state() { - TenantState::Active => { - // Fast path: we don't need to do any async waiting. 
- return Ok(tenant.clone()); - } - _ => { - tenant.activate_now(); - (WaitFor::Tenant(tenant.clone()), tenant_shard_id) - } - } - } - Some(TenantSlot::Secondary(_)) => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_id, - ))) - } - Some(TenantSlot::InProgress(barrier)) => { - (WaitFor::Barrier(barrier.clone()), tenant_shard_id) - } - None => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound( - tenant_id, - ))) - } - } - }; - - let tenant = match wait_for { - WaitFor::Barrier(barrier) => { - tracing::debug!("Waiting for tenant InProgress state to pass..."); - timeout_cancellable( - deadline.duration_since(Instant::now()), - cancel, - barrier.wait(), - ) - .await - .map_err(|e| match e { - TimeoutCancellableError::Timeout => GetActiveTenantError::WaitForActiveTimeout { - latest_state: None, - wait_time: wait_start.elapsed(), - }, - TimeoutCancellableError::Cancelled => GetActiveTenantError::Cancelled, - })?; - { - let locked = TENANTS.read().unwrap(); - let peek_slot = - tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) - .map_err(GetTenantError::MapState)?; - match peek_slot { - Some(TenantSlot::Attached(tenant)) => tenant.clone(), - _ => { - return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( - tenant_id, - ))) - } - } - } - } - WaitFor::Tenant(tenant) => tenant, - }; - - tracing::debug!("Waiting for tenant to enter active state..."); - tenant - .wait_to_become_active(deadline.duration_since(Instant::now())) - .await?; - Ok(tenant) + /// Broken is logically a subset of WillNotBecomeActive, but a distinct error is useful as + /// WillNotBecomeActive is a permitted error under some circumstances, whereas broken should + /// never happen. 
+ #[error("Tenant is broken: {0}")] + Broken(String), } #[derive(Debug, thiserror::Error)] @@ -1447,7 +2213,7 @@ pub(crate) enum DeleteTimelineError { #[derive(Debug, thiserror::Error)] pub(crate) enum TenantStateError { #[error("Tenant {0} is stopping")] - IsStopping(TenantId), + IsStopping(TenantShardId), #[error(transparent)] SlotError(#[from] TenantSlotError), #[error(transparent)] @@ -1456,93 +2222,12 @@ pub(crate) enum TenantStateError { Other(#[from] anyhow::Error), } -pub(crate) async fn detach_tenant( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result<(), TenantStateError> { - let tmp_path = detach_tenant0( - conf, - &TENANTS, - tenant_shard_id, - detach_ignored, - deletion_queue_client, - ) - .await?; - // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory. - // After a tenant is detached, there are no more task_mgr tasks for that tenant_id. 
- let task_tenant_id = None; - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::MgmtRequest, - task_tenant_id, - None, - "tenant_files_delete", - false, - async move { - fs::remove_dir_all(tmp_path.as_path()) - .await - .with_context(|| format!("tenant directory {:?} deletion", tmp_path)) - }, - ); - Ok(()) -} - -async fn detach_tenant0( - conf: &'static PageServerConf, - tenants: &std::sync::RwLock, - tenant_shard_id: TenantShardId, - detach_ignored: bool, - deletion_queue_client: &DeletionQueueClient, -) -> Result { - let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move { - let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean); - safe_rename_tenant_dir(&local_tenant_directory) - .await - .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename")) - }; - - let removal_result = remove_tenant_from_memory( - tenants, - tenant_shard_id, - tenant_dir_rename_operation(tenant_shard_id), - ) - .await; - - // Flush pending deletions, so that they have a good chance of passing validation - // before this tenant is potentially re-attached elsewhere. - deletion_queue_client.flush_advisory(); - - // Ignored tenants are not present in memory and will bail the removal from memory operation. - // Before returning the error, check for ignored tenant removal case — we only need to clean its local files then. 
- if detach_ignored - && matches!( - removal_result, - Err(TenantStateError::SlotError(TenantSlotError::NotFound(_))) - ) - { - let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id); - if tenant_ignore_mark.exists() { - info!("Detaching an ignored tenant"); - let tmp_path = tenant_dir_rename_operation(tenant_shard_id) - .await - .with_context(|| { - format!("Ignored tenant {tenant_shard_id} local directory rename") - })?; - return Ok(tmp_path); - } - } - - removal_result -} - pub(crate) async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, generation: Generation, broker_client: storage_broker::BrokerClientChannel, - remote_storage: Option, + remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { @@ -1570,7 +2255,7 @@ pub(crate) async fn load_tenant( let mut location_conf = Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?; - location_conf.attach_in_generation(generation); + location_conf.attach_in_generation(AttachmentMode::Single, generation); Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?; @@ -1584,7 +2269,7 @@ pub(crate) async fn load_tenant( shard_identity, None, &TENANTS, - SpawnMode::Normal, + SpawnMode::Eager, ctx, ) .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?; @@ -1600,6 +2285,7 @@ pub(crate) async fn ignore_tenant( ignore_tenant0(conf, &TENANTS, tenant_id).await } +#[instrument(skip_all, fields(shard_id))] async fn ignore_tenant0( conf: &'static PageServerConf, tenants: &std::sync::RwLock, @@ -1607,6 +2293,10 @@ async fn ignore_tenant0( ) -> Result<(), TenantStateError> { // This is a legacy API (replaced by `/location_conf`). 
It does not support sharding let tenant_shard_id = TenantShardId::unsharded(tenant_id); + tracing::Span::current().record( + "shard_id", + tracing::field::display(tenant_shard_id.shard_slug()), + ); remove_tenant_from_memory(tenants, tenant_shard_id, async { let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id); @@ -1629,74 +2319,6 @@ pub(crate) enum TenantMapListError { Initializing, } -/// -/// Get list of tenants, for the mgmt API -/// -pub(crate) async fn list_tenants() -> Result, TenantMapListError> -{ - let tenants = TENANTS.read().unwrap(); - let m = match &*tenants { - TenantsMap::Initializing => return Err(TenantMapListError::Initializing), - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, - }; - Ok(m.iter() - .filter_map(|(id, tenant)| match tenant { - TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), - TenantSlot::Secondary(_) => None, - TenantSlot::InProgress(_) => None, - }) - .collect()) -} - -/// Execute Attach mgmt API command. -/// -/// Downloading all the tenant data is performed in the background, this merely -/// spawns the background task and returns quickly. -pub(crate) async fn attach_tenant( - conf: &'static PageServerConf, - tenant_id: TenantId, - generation: Generation, - tenant_conf: TenantConfOpt, - resources: TenantSharedResources, - ctx: &RequestContext, -) -> Result<(), TenantMapInsertError> { - // This is a legacy API (replaced by `/location_conf`). It does not support sharding - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - - let slot_guard = - tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?; - let location_conf = LocationConf::attached_single(tenant_conf, generation); - let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?; - // TODO: tenant directory remains on disk if we bail out from here on. 
- // See https://github.com/neondatabase/neon/issues/4233 - - let shard_identity = location_conf.shard; - let attached_tenant = tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir, - resources, - AttachedTenantConf::try_from(location_conf)?, - shard_identity, - None, - &TENANTS, - SpawnMode::Normal, - ctx, - )?; - // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. - // See https://github.com/neondatabase/neon/issues/4233 - - let attached_tenant_id = attached_tenant.tenant_id(); - if tenant_id != attached_tenant_id { - return Err(TenantMapInsertError::Other(anyhow::anyhow!( - "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})", - ))); - } - - slot_guard.upsert(TenantSlot::Attached(attached_tenant))?; - Ok(()) -} - #[derive(Debug, thiserror::Error)] pub(crate) enum TenantMapInsertError { #[error(transparent)] @@ -1710,7 +2332,7 @@ pub(crate) enum TenantMapInsertError { /// Superset of TenantMapError: issues that can occur when acquiring a slot /// for a particular tenant ID. #[derive(Debug, thiserror::Error)] -pub enum TenantSlotError { +pub(crate) enum TenantSlotError { /// When acquiring a slot with the expectation that the tenant already exists. #[error("Tenant {0} not found")] NotFound(TenantShardId), @@ -1719,9 +2341,6 @@ pub enum TenantSlotError { #[error("tenant {0} already exists, state: {1:?}")] AlreadyExists(TenantShardId, TenantState), - #[error("tenant {0} already exists in but is not attached")] - Conflict(TenantShardId), - // Tried to read a slot that is currently being mutated by another administrative // operation. #[error("tenant has a state change in progress, try again later")] @@ -1733,14 +2352,31 @@ pub enum TenantSlotError { /// Superset of TenantMapError: issues that can occur when using a SlotGuard /// to insert a new value. 
-#[derive(Debug, thiserror::Error)] -pub enum TenantSlotUpsertError { +#[derive(thiserror::Error)] +pub(crate) enum TenantSlotUpsertError { /// An error where the slot is in an unexpected state, indicating a code bug #[error("Internal error updating Tenant")] InternalError(Cow<'static, str>), #[error(transparent)] - MapState(#[from] TenantMapError), + MapState(TenantMapError), + + // If we encounter TenantManager shutdown during upsert, we must carry the Completion + // from the SlotGuard, so that the caller can hold it while they clean up: otherwise + // TenantManager shutdown might race ahead before we're done cleaning up any Tenant that + // was protected by the SlotGuard. + #[error("Shutting down")] + ShuttingDown((TenantSlot, utils::completion::Completion)), +} + +impl std::fmt::Debug for TenantSlotUpsertError { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Self::InternalError(reason) => write!(f, "Internal Error {reason}"), + Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"), + Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"), + } + } } #[derive(Debug, thiserror::Error)] @@ -1789,7 +2425,7 @@ pub struct SlotGuard { /// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will /// release any waiters as soon as this SlotGuard is dropped. 
- _completion: utils::completion::Completion, + completion: utils::completion::Completion, } impl SlotGuard { @@ -1802,7 +2438,7 @@ impl SlotGuard { tenant_shard_id, old_value, upserted: false, - _completion: completion, + completion, } } @@ -1835,17 +2471,27 @@ impl SlotGuard { } let m = match &mut *locked { - TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()), + TenantsMap::Initializing => { + return Err(TenantSlotUpsertError::MapState( + TenantMapError::StillInitializing, + )) + } TenantsMap::ShuttingDown(_) => { - return Err(TenantMapError::ShuttingDown.into()); + return Err(TenantSlotUpsertError::ShuttingDown(( + new_value, + self.completion.clone(), + ))); } TenantsMap::Open(m) => m, }; + METRICS.slot_inserted(&new_value); + let replaced = m.insert(self.tenant_shard_id, new_value); self.upserted = true; - - METRICS.tenant_slots.set(m.len() as u64); + if let Some(replaced) = replaced.as_ref() { + METRICS.slot_removed(replaced); + } replaced }; @@ -1885,7 +2531,9 @@ impl SlotGuard { Err(TenantSlotUpsertError::InternalError(_)) => { // We already logged the error, nothing else we can do. 
} - Err(TenantSlotUpsertError::MapState(_)) => { + Err( + TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_), + ) => { // If the map is shutting down, we need not replace anything } Ok(()) => {} @@ -1953,9 +2601,13 @@ impl Drop for SlotGuard { } if self.old_value_is_shutdown() { + METRICS.slot_removed(entry.get()); entry.remove(); } else { - entry.insert(self.old_value.take().unwrap()); + let inserting = self.old_value.take().unwrap(); + METRICS.slot_inserted(&inserting); + let replaced = entry.insert(inserting); + METRICS.slot_removed(&replaced); } } Entry::Vacant(_) => { @@ -1966,8 +2618,6 @@ impl Drop for SlotGuard { ); } } - - METRICS.tenant_slots.set(m.len() as u64); } } @@ -1983,18 +2633,22 @@ fn tenant_map_peek_slot<'a>( tenant_shard_id: &TenantShardId, mode: TenantSlotPeekMode, ) -> Result, TenantMapError> { - let m = match tenants.deref() { - TenantsMap::Initializing => return Err(TenantMapError::StillInitializing), + match tenants.deref() { + TenantsMap::Initializing => Err(TenantMapError::StillInitializing), TenantsMap::ShuttingDown(m) => match mode { - TenantSlotPeekMode::Read => m, - TenantSlotPeekMode::Write => { - return Err(TenantMapError::ShuttingDown); - } + TenantSlotPeekMode::Read => Ok(Some( + // When reading in ShuttingDown state, we must translate None results + // into a ShuttingDown error, because absence of a tenant shard ID in the map + // isn't a reliable indicator of the tenant being gone: it might have been + // InProgress when shutdown started, and cleaned up from that state such + // that it's now no longer in the map. Callers will have to wait until + // we next start up to get a proper answer. This avoids incorrect 404 API responses. 
+ m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?, + )), + TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown), }, - TenantsMap::Open(m) => m, - }; - - Ok(m.get(tenant_shard_id)) + TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)), + } } enum TenantSlotAcquireMode { @@ -2022,7 +2676,7 @@ fn tenant_map_acquire_slot_impl( METRICS.tenant_slot_writes.inc(); let mut locked = tenants.write().unwrap(); - let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug()); + let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()); let _guard = span.enter(); let m = match &mut *locked { @@ -2043,7 +2697,9 @@ fn tenant_map_acquire_slot_impl( } _ => { let (completion, barrier) = utils::completion::channel(); - v.insert(TenantSlot::InProgress(barrier)); + let inserting = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&inserting); + v.insert(inserting); tracing::debug!("Vacant, inserted InProgress"); Ok(SlotGuard::new(*tenant_shard_id, None, completion)) } @@ -2079,7 +2735,10 @@ fn tenant_map_acquire_slot_impl( _ => { // Happy case: the slot was not in any state that violated our mode let (completion, barrier) = utils::completion::channel(); - let old_value = o.insert(TenantSlot::InProgress(barrier)); + let in_progress = TenantSlot::InProgress(barrier); + METRICS.slot_inserted(&in_progress); + let old_value = o.insert(in_progress); + METRICS.slot_removed(&old_value); tracing::debug!("Occupied, replaced with InProgress"); Ok(SlotGuard::new( *tenant_shard_id, @@ -2104,8 +2763,6 @@ async fn remove_tenant_from_memory( where F: std::future::Future>, { - use utils::completion; - let mut slot_guard = tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?; @@ -2117,17 +2774,17 @@ where let attached_tenant = match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { // whenever 
we remove a tenant from memory, we don't want to flush and wait for upload - let freeze_and_flush = false; + let shutdown_mode = ShutdownMode::Hard; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. - match tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, shutdown_mode).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to // wait for it but return an error right away because these are distinct requests. slot_guard.revert(); - return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id)); + return Err(TenantStateError::IsStopping(tenant_shard_id)); } } Some(tenant) @@ -2174,92 +2831,86 @@ use { utils::http::error::ApiError, }; +#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn immediate_gc( tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, cancel: CancellationToken, ctx: &RequestContext, -) -> Result>, ApiError> { - let guard = TENANTS.read().unwrap(); - - let tenant = guard - .get(&tenant_shard_id) - .map(Arc::clone) - .with_context(|| format!("tenant {tenant_shard_id}")) - .map_err(|e| ApiError::NotFound(e.into()))?; +) -> Result { + let tenant = { + let guard = TENANTS.read().unwrap(); + guard + .get(&tenant_shard_id) + .cloned() + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))? 
+ }; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + // Run in task_mgr to avoid race with tenant_detach operation - let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); - // TODO: spawning is redundant now, need to hold the gate - task_mgr::spawn( - &tokio::runtime::Handle::current(), - TaskKind::GarbageCollector, - Some(tenant_shard_id), - Some(timeline_id), - &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), - false, - async move { - fail::fail_point!("immediate_gc_task_pre"); + let ctx: RequestContext = + ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - #[allow(unused_mut)] - let mut result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id)) - .await; - // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it - // better once the types support it. + let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?; - #[cfg(feature = "testing")] - { - if let Ok(result) = result.as_mut() { - // why not futures unordered? it seems it needs very much the same task structure - // but would only run on single task. 
- let mut js = tokio::task::JoinSet::new(); - for layer in std::mem::take(&mut result.doomed_layers) { - js.spawn(layer.wait_drop()); - } - tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped"); - while let Some(res) = js.join_next().await { - res.expect("wait_drop should not panic"); - } - } + fail::fail_point!("immediate_gc_task_pre"); - let timeline = tenant.get_timeline(timeline_id, false).ok(); - let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref()); + #[allow(unused_mut)] + let mut result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) + .await; + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. - if let Some(rtc) = rtc { - // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care just exit fast about the shutdown error - drop(rtc.wait_completion().await); - } + #[cfg(feature = "testing")] + { + // we need to synchronize with drop completion for python tests without polling for + // log messages + if let Ok(result) = result.as_mut() { + let mut js = tokio::task::JoinSet::new(); + for layer in std::mem::take(&mut result.doomed_layers) { + js.spawn(layer.wait_drop()); } - - match task_done.send(result) { - Ok(_) => (), - Err(result) => error!("failed to send gc result: {result:?}"), + tracing::info!( + total = js.len(), + "starting to wait for the gc'd layers to be dropped" + ); + while let Some(res) = js.join_next().await { + res.expect("wait_drop should not panic"); } - Ok(()) } - ); - // drop the guard until after we've spawned the task so that timeline shutdown will wait for the task - drop(guard); + let timeline = tenant.get_timeline(timeline_id, false).ok(); + let rtc = timeline.as_ref().map(|x| &x.remote_client); - Ok(wait_task_done) + if let Some(rtc) = rtc { + // layer drops schedule actions on remote timeline client to actually do the + // deletions; don't 
care about the shutdown error, just exit fast + drop(rtc.wait_completion().await); + } + } + + result.map_err(|e| match e { + GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown, + GcError::TimelineNotFound => { + ApiError::NotFound(anyhow::anyhow!("Timeline not found").into()) + } + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + }) } #[cfg(test)] mod tests { - use pageserver_api::shard::TenantShardId; use std::collections::BTreeMap; use std::sync::Arc; - use tracing::{info_span, Instrument}; + use tracing::Instrument; use crate::tenant::mgr::TenantSlot; @@ -2270,17 +2921,16 @@ mod tests { // Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully // wait for it to complete before proceeding. - let (t, _ctx) = TenantHarness::create("shutdown_awaits_in_progress_tenant") - .unwrap() - .load() - .await; + let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap(); + let (t, _ctx) = h.load().await; // harness loads it to active, which is forced and nothing is running on the tenant - let id = TenantShardId::unsharded(t.tenant_id()); + let id = t.tenant_shard_id(); // tenant harness configures the logging and we cannot escape it - let _e = info_span!("testing", tenant_id = %id).entered(); + let span = h.span(); + let _e = span.enter(); let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]); let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants))); @@ -2301,7 +2951,7 @@ mod tests { }; super::remove_tenant_from_memory(&tenants, id, cleanup).await } - .instrument(info_span!("foobar", tenant_id = %id)) + .instrument(h.span()) }); // now the long cleanup should be in place, with the stopping state diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs deleted file mode 100644 index 3acb0fb431..0000000000 --- a/pageserver/src/tenant/par_fsync.rs +++ /dev/null @@ -1,84 +0,0 @@ -use std::{ - io, - 
sync::atomic::{AtomicUsize, Ordering}, -}; - -use camino::{Utf8Path, Utf8PathBuf}; - -fn fsync_path(path: &Utf8Path) -> io::Result<()> { - // TODO use VirtualFile::fsync_all once we fully go async. - let file = std::fs::File::open(path)?; - file.sync_all() -} - -fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> { - while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) { - fsync_path(path)?; - } - - Ok(()) -} - -fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> { - // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything. - - /// Use at most this number of threads. - /// Increasing this limit will - /// - use more memory - /// - increase the cost of spawn/join latency - const MAX_NUM_THREADS: usize = 64; - let num_threads = paths.len().min(MAX_NUM_THREADS); - let next_path_idx = AtomicUsize::new(0); - - std::thread::scope(|s| -> io::Result<()> { - let mut handles = vec![]; - // Spawn `num_threads - 1`, as the current thread is also a worker. - for _ in 1..num_threads { - handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx))); - } - - parallel_worker(paths, &next_path_idx)?; - - for handle in handles { - handle.join().unwrap()?; - } - - Ok(()) - }) -} - -/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool. -pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> { - if paths.len() == 1 { - fsync_path(&paths[0])?; - return Ok(()); - } - - fsync_in_thread_pool(paths) -} - -/// Parallel fsync asynchronously. 
-pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> { - const MAX_CONCURRENT_FSYNC: usize = 64; - let mut next = paths.iter().peekable(); - let mut js = tokio::task::JoinSet::new(); - loop { - while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() { - let next = next.next().expect("just peeked"); - let next = next.to_owned(); - js.spawn_blocking(move || fsync_path(&next)); - } - - // now the joinset has been filled up, wait for next to complete - if let Some(res) = js.join_next().await { - res??; - } else { - // last item had already completed - assert!( - next.peek().is_none(), - "joinset emptied, we shouldn't have more work" - ); - return Ok(()); - } - } -} diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 2ea3ced008..e33e4b84aa 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -91,8 +91,7 @@ //! //! The *actual* remote state lags behind the *desired* remote state while //! there are in-flight operations. -//! We keep track of the desired remote state in -//! [`UploadQueueInitialized::latest_files`] and [`UploadQueueInitialized::latest_metadata`]. +//! We keep track of the desired remote state in [`UploadQueueInitialized::dirty`]. //! It is initialized based on the [`IndexPart`] that was passed during init //! and updated with every `schedule_*` function call. //! All this is necessary necessary to compute the future [`IndexPart`]s @@ -115,8 +114,7 @@ //! //! # Completion //! -//! Once an operation has completed, we update -//! [`UploadQueueInitialized::projected_remote_consistent_lsn`] immediately, +//! Once an operation has completed, we update [`UploadQueueInitialized::clean`] immediately, //! and submit a request through the DeletionQueue to update //! [`UploadQueueInitialized::visible_remote_consistent_lsn`] after it has //! validated that our generation is not stale. 
It is this visible value @@ -182,13 +180,14 @@ pub(crate) mod download; pub mod index; -mod upload; +pub(crate) mod upload; use anyhow::Context; use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; +use pageserver_api::models::AuxFilePolicy; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -196,20 +195,23 @@ pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; -use utils::timeout::{timeout_cancellable, TimeoutCancellableError}; +use utils::pausable_failpoint; use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use std::time::Duration; -use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath}; +use remote_storage::{ + DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel, +}; use std::ops::DerefMut; use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; -use crate::deletion_queue::DeletionQueueClient; +use crate::context::RequestContext; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, @@ -217,8 +219,9 @@ use crate::metrics::{ }; use crate::task_mgr::shutdown_token; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::remote_timeline_client::download::download_retry; use crate::tenant::storage_layer::AsLayerDesc; -use crate::tenant::upload_queue::Delete; +use crate::tenant::upload_queue::{Delete, UploadQueueStoppedDeletable}; use crate::tenant::TIMELINES_SEGMENT_NAME; use crate::{ config::PageServerConf, @@ -236,11 +239,14 @@ use 
utils::id::{TenantId, TimelineId}; use self::index::IndexPart; -use super::storage_layer::{Layer, LayerFileName, ResidentLayer}; +use super::metadata::MetadataUpdate; +use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::upload_queue::SetDeletedFlagProgress; use super::Generation; -pub(crate) use download::{is_temp_download_file, list_remote_timelines}; +pub(crate) use download::{ + download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, +}; pub(crate) use index::LayerFileMetadata; // Occasional network issues and such can cause remote operations to fail, and @@ -257,23 +263,20 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; pub(crate) const INITDB_PATH: &str = "initdb.tar.zst"; +pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst"; + /// Default buffer size when interfacing with [`tokio::fs::File`]. pub(crate) const BUFFER_SIZE: usize = 32 * 1024; +/// Doing non-essential flushes of deletion queue is subject to this timeout, after +/// which we warn and skip. +const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), } -/// Errors that can arise when calling [`RemoteTimelineClient::stop`]. -#[derive(Debug, thiserror::Error)] -pub enum StopError { - /// Returned if the upload queue was never initialized. - /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`]. 
- #[error("queue is not initialized")] - QueueUninitialized, -} - #[derive(Debug, thiserror::Error)] pub enum PersistIndexPartWithDeletedFlagError { #[error("another task is already setting the deleted_flag, started at {0:?}")] @@ -314,7 +317,7 @@ pub struct RemoteTimelineClient { upload_queue: Mutex, - metrics: Arc, + pub(crate) metrics: Arc, storage_impl: GenericRemoteStorage, @@ -323,45 +326,6 @@ pub struct RemoteTimelineClient { cancel: CancellationToken, } -/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not -/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that. -const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120); -const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120); - -/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow. -/// -/// This is a convenience for the various upload functions. In future -/// the anyhow::Error result should be replaced with a more structured type that -/// enables callers to avoid handling shutdown as an error. -async fn upload_cancellable(cancel: &CancellationToken, future: F) -> anyhow::Result<()> -where - F: std::future::Future>, -{ - match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await { - Ok(Ok(())) => Ok(()), - Ok(Err(e)) => Err(e), - Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")), - Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")), - } -} -/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError. 
-async fn download_cancellable( - cancel: &CancellationToken, - future: F, -) -> Result -where - F: std::future::Future>, -{ - match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await { - Ok(Ok(r)) => Ok(r), - Ok(Err(e)) => Err(e), - Err(TimeoutCancellableError::Timeout) => { - Err(DownloadError::Other(anyhow::anyhow!("Timed out"))) - } - Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled), - } -} - impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline @@ -437,15 +401,10 @@ impl RemoteTimelineClient { "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted" ))?; - { - let mut upload_queue = self.upload_queue.lock().unwrap(); - upload_queue.initialize_with_current_remote_index_part(index_part)?; - self.update_remote_physical_size_gauge(Some(index_part)); - } - // also locks upload queue, without dropping the guard above it will be a deadlock - self.stop().expect("initialized line above"); - let mut upload_queue = self.upload_queue.lock().unwrap(); + upload_queue.initialize_with_current_remote_index_part(index_part)?; + self.update_remote_physical_size_gauge(Some(index_part)); + self.stop_impl(&mut upload_queue); upload_queue .stopped_mut() @@ -455,11 +414,13 @@ impl RemoteTimelineClient { Ok(()) } + /// Returns `None` if nothing is yet uplodaded, `Some(disk_consistent_lsn)` otherwise. 
pub fn remote_consistent_lsn_projected(&self) -> Option { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => q.get_last_remote_consistent_lsn_projected(), - UploadQueue::Stopped(q) => q + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, + UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => q .upload_queue_for_deletion .get_last_remote_consistent_lsn_projected(), } @@ -469,29 +430,40 @@ impl RemoteTimelineClient { match &mut *self.upload_queue.lock().unwrap() { UploadQueue::Uninitialized => None, UploadQueue::Initialized(q) => Some(q.get_last_remote_consistent_lsn_visible()), - UploadQueue::Stopped(q) => Some( + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => None, + UploadQueue::Stopped(UploadQueueStopped::Deletable(q)) => Some( q.upload_queue_for_deletion .get_last_remote_consistent_lsn_visible(), ), } } + /// Returns true if this timeline was previously detached at this Lsn and the remote timeline + /// client is currently initialized. + pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + self.upload_queue + .lock() + .unwrap() + .initialized_mut() + .map(|uq| uq.clean.0.lineage.is_previous_ancestor_lsn(lsn)) + .unwrap_or(false) + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part .layer_metadata .values() - // If we don't have the file size for the layer, don't account for it in the metric. 
.map(|ilmd| ilmd.file_size) .sum() } else { 0 }; - self.metrics.remote_physical_size_set(size); + self.metrics.remote_physical_size_gauge.set(size); } pub fn get_remote_physical_size(&self) -> u64 { - self.metrics.remote_physical_size_get() + self.metrics.remote_physical_size_gauge.get() } // @@ -504,7 +476,7 @@ impl RemoteTimelineClient { /// Download index file pub async fn download_index_file( &self, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Index, @@ -514,7 +486,7 @@ impl RemoteTimelineClient { }, ); - let index_part = download::download_index_part( + let (index_part, _index_generation) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -522,8 +494,6 @@ impl RemoteTimelineClient { cancel, ) .measure_remote_op( - self.tenant_shard_id.tenant_id, - self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Download, Arc::clone(&self.metrics), @@ -544,9 +514,11 @@ impl RemoteTimelineClient { /// On success, returns the size of the downloaded file. pub async fn download_layer_file( &self, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, + ctx: &RequestContext, ) -> anyhow::Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( @@ -563,11 +535,11 @@ impl RemoteTimelineClient { self.timeline_id, layer_file_name, layer_metadata, + local_path, cancel, + ctx, ) .measure_remote_op( - self.tenant_shard_id.tenant_id, - self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Download, Arc::clone(&self.metrics), @@ -585,9 +557,10 @@ impl RemoteTimelineClient { // Upload operations. // - /// /// Launch an index-file upload operation in the background, with - /// updated metadata. + /// fully updated metadata. + /// + /// This should only be used to upload initial metadata to remote storage. 
/// /// The upload will be added to the queue immediately, but it /// won't be performed until all previously scheduled layer file @@ -599,7 +572,7 @@ impl RemoteTimelineClient { /// If there were any changes to the list of files, i.e. if any /// layer file uploads were scheduled, since the last index file /// upload, those will be included too. - pub fn schedule_index_upload_for_metadata_update( + pub fn schedule_index_upload_for_full_metadata_update( self: &Arc, metadata: &TimelineMetadata, ) -> anyhow::Result<()> { @@ -608,13 +581,45 @@ impl RemoteTimelineClient { // As documented in the struct definition, it's ok for latest_metadata to be // ahead of what's _actually_ on the remote during index upload. - upload_queue.latest_metadata = metadata.clone(); + upload_queue.dirty.metadata = metadata.clone(); - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue)?; Ok(()) } + /// Launch an index-file upload operation in the background, with only parts of the metadata + /// updated. + /// + /// This is the regular way of updating metadata on layer flushes or Gc. + /// + /// Using this lighter update mechanism allows for reparenting and detaching without changes to + /// `index_part.json`, while being more clear on what values update regularly. + pub(crate) fn schedule_index_upload_for_metadata_update( + self: &Arc, + update: &MetadataUpdate, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.dirty.metadata.apply(update); + + self.schedule_index_upload(upload_queue)?; + + Ok(()) + } + + /// Launch an index-file upload operation in the background, with only aux_file_policy flag updated. 
+ pub(crate) fn schedule_index_upload_for_aux_file_policy_update( + self: &Arc, + last_aux_file_policy: Option, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.last_aux_file_policy = last_aux_file_policy; + self.schedule_index_upload(upload_queue)?; + Ok(()) + } /// /// Launch an index-file upload operation in the background, if necessary. /// @@ -630,7 +635,7 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone()); + self.schedule_index_upload(upload_queue)?; } Ok(()) @@ -640,33 +645,98 @@ impl RemoteTimelineClient { fn schedule_index_upload( self: &Arc, upload_queue: &mut UploadQueueInitialized, - metadata: TimelineMetadata, - ) { + ) -> anyhow::Result<()> { + let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn(); + // fix up the duplicated field + upload_queue.dirty.disk_consistent_lsn = disk_consistent_lsn; + + // make sure it serializes before doing it in perform_upload_task so that it doesn't + // look like a retryable error + let void = std::io::sink(); + serde_json::to_writer(void, &upload_queue.dirty).context("serialize index_part.json")?; + + let index_part = &upload_queue.dirty; + info!( - "scheduling metadata upload with {} files ({} changed)", - upload_queue.latest_files.len(), + "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)", + index_part.layer_metadata.len(), upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - - let index_part = IndexPart::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata, - ); - let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn); - 
self.calls_unfinished_metric_begin(&op); + let op = UploadOp::UploadMetadata { + uploaded: Box::new(index_part.clone()), + }; + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0; // Launch the task immediately, if possible self.launch_queued_tasks(upload_queue); + Ok(()) } + pub(crate) async fn schedule_reparenting_and_wait( + self: &Arc, + new_parent: &TimelineId, + ) -> anyhow::Result<()> { + // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing + // and reads the in-memory part we cannot do the detaching like this + let receiver = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + let Some(prev) = upload_queue.dirty.metadata.ancestor_timeline() else { + return Err(anyhow::anyhow!( + "cannot reparent without a current ancestor" + )); + }; + + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); + + self.schedule_index_upload(upload_queue)?; + + self.schedule_barrier0(upload_queue) + }; + + Self::wait_completion0(receiver).await + } + + /// Schedules uploading a new version of `index_part.json` with the given layers added, + /// detaching from ancestor and waits for it to complete. /// - /// Launch an upload operation in the background. - /// + /// This is used with `Timeline::detach_ancestor` functionality. 
+ pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait( + self: &Arc, + layers: &[Layer], + adopted: (TimelineId, Lsn), + ) -> anyhow::Result<()> { + let barrier = { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); + + for layer in layers { + upload_queue + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + } + + self.schedule_index_upload(upload_queue)?; + + let barrier = self.schedule_barrier0(upload_queue); + self.launch_queued_tasks(upload_queue); + barrier + }; + + Self::wait_completion0(barrier).await + } + + /// Launch an upload operation in the background; the file is added to be included in next + /// `index_part.json` upload. pub(crate) fn schedule_layer_file_upload( self: &Arc, layer: ResidentLayer, @@ -687,13 +757,19 @@ impl RemoteTimelineClient { let metadata = layer.metadata(); upload_queue - .latest_files - .insert(layer.layer_desc().filename(), metadata.clone()); + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), metadata.clone()); upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; - info!("scheduled layer file upload {layer}"); + info!( + gen=?metadata.generation, + shard=?metadata.shard, + "scheduled layer file upload {layer}", + ); + let op = UploadOp::UploadLayer(layer, metadata); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -707,13 +783,13 @@ impl RemoteTimelineClient { /// successfully. 
pub fn schedule_layer_file_deletion( self: &Arc, - names: &[LayerFileName], + names: &[LayerName], ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - let with_metadata = - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned()); + let with_metadata = self + .schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned())?; self.schedule_deletion_of_unlinked0(upload_queue, with_metadata); @@ -735,9 +811,9 @@ impl RemoteTimelineClient { // the layer files as "dangling". this is fine, at worst case we create work for the // scrubber. - let names = gc_layers.iter().map(|x| x.layer_desc().filename()); + let names = gc_layers.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; self.launch_queued_tasks(upload_queue); @@ -750,21 +826,17 @@ impl RemoteTimelineClient { self: &Arc, upload_queue: &mut UploadQueueInitialized, names: I, - ) -> Vec<(LayerFileName, LayerFileMetadata)> + ) -> anyhow::Result> where - I: IntoIterator, + I: IntoIterator, { - // Deleting layers doesn't affect the values stored in TimelineMetadata, - // so we don't need update it. Just serialize it. - let metadata = upload_queue.latest_metadata.clone(); - // Decorate our list of names with each name's metadata, dropping // names that are unexpectedly missing from our metadata. This metadata // is later used when physically deleting layers, to construct key paths. 
let with_metadata: Vec<_> = names .into_iter() .filter_map(|name| { - let meta = upload_queue.latest_files.remove(&name); + let meta = upload_queue.dirty.layer_metadata.remove(&name); if let Some(meta) = meta { upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1; @@ -796,17 +868,17 @@ impl RemoteTimelineClient { // index_part update, because that needs to be uploaded before we can actually delete the // files. if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 { - self.schedule_index_upload(upload_queue, metadata); + self.schedule_index_upload(upload_queue)?; } - with_metadata + Ok(with_metadata) } /// Schedules deletion for layer files which have previously been unlinked from the /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`]. pub(crate) fn schedule_deletion_of_unlinked( self: &Arc, - layers: Vec<(LayerFileName, LayerFileMetadata)>, + layers: Vec<(LayerName, LayerFileMetadata)>, ) -> anyhow::Result<()> { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -819,7 +891,7 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerName, LayerFileMetadata)>, ) { // Filter out any layers which were not created by this tenant shard. 
These are // layers that originate from some ancestor shard after a split, and may still @@ -862,10 +934,14 @@ impl RemoteTimelineClient { } // schedule the actual deletions + if with_metadata.is_empty() { + // avoid scheduling the op & bumping the metric + return; + } let op = UploadOp::Delete(Delete { layers: with_metadata, }); - self.calls_unfinished_metric_begin(&op); + self.metric_begin(&op); upload_queue.queued_operations.push_back(op); } @@ -884,9 +960,9 @@ impl RemoteTimelineClient { self.schedule_layer_file_upload0(upload_queue, layer.clone()); } - let names = compacted_from.iter().map(|x| x.layer_desc().filename()); + let names = compacted_from.iter().map(|x| x.layer_desc().layer_name()); - self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names); + self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names)?; self.launch_queued_tasks(upload_queue); Ok(()) @@ -894,12 +970,18 @@ impl RemoteTimelineClient { /// Wait for all previously scheduled uploads/deletions to complete pub(crate) async fn wait_completion(self: &Arc) -> anyhow::Result<()> { - let mut receiver = { + let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; self.schedule_barrier0(upload_queue) }; + Self::wait_completion0(receiver).await + } + + async fn wait_completion0( + mut receiver: tokio::sync::watch::Receiver<()>, + ) -> anyhow::Result<()> { if receiver.changed().await.is_err() { anyhow::bail!("wait_completion aborted because upload queue was stopped"); } @@ -933,7 +1015,7 @@ impl RemoteTimelineClient { /// Wait for all previously scheduled operations to complete, and then stop. /// /// Not cancellation safe - pub(crate) async fn shutdown(self: &Arc) -> Result<(), StopError> { + pub(crate) async fn shutdown(self: &Arc) { // On cancellation the queue is left in ackward state of refusing new operations but // proper stop is yet to be called. 
On cancel the original or some later task must call // `stop` or `shutdown`. @@ -944,8 +1026,12 @@ impl RemoteTimelineClient { let fut = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = match &mut *guard { - UploadQueue::Stopped(_) => return Ok(()), - UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized), + UploadQueue::Stopped(_) => return, + UploadQueue::Uninitialized => { + // transition into Stopped state + self.stop_impl(&mut guard); + return; + } UploadQueue::Initialized(ref mut init) => init, }; @@ -977,7 +1063,7 @@ impl RemoteTimelineClient { } } - self.stop() + self.stop(); } /// Set the deleted_at field in the remote index file. @@ -1011,8 +1097,7 @@ impl RemoteTimelineClient { let deleted_at = Utc::now().naive_utc(); stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at); - let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion) - .context("IndexPart serialize")?; + let mut index_part = stopped.upload_queue_for_deletion.dirty.clone(); index_part.deleted_at = Some(deleted_at); index_part }; @@ -1045,9 +1130,11 @@ impl RemoteTimelineClient { // when executed as part of tenant deletion this happens in the background 2, "persist_index_part_with_deleted_flag", - backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")), + &self.cancel, ) - .await?; + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x)?; // all good, disarm the guard and mark as success ScopeGuard::into_inner(undo_deleted_at); @@ -1067,6 +1154,142 @@ impl RemoteTimelineClient { Ok(()) } + pub(crate) fn is_deleting(&self) -> bool { + let mut locked = self.upload_queue.lock().unwrap(); + locked.stopped_mut().is_ok() + } + + pub(crate) async fn preserve_initdb_archive( + self: &Arc, + tenant_id: &TenantId, + timeline_id: &TimelineId, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + backoff::retry( + || async { + 
upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "preserve_initdb_tar_zst", + &cancel.clone(), + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("backing up initdb archive")?; + Ok(()) + } + + /// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload. + /// + /// This is not normally needed. + pub(crate) async fn upload_layer_file( + self: &Arc, + uploaded: &ResidentLayer, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &uploaded.layer_desc().layer_name(), + uploaded.metadata().generation, + ); + + backoff::retry( + || async { + upload::upload_timeline_layer( + &self.storage_impl, + uploaded.local_path(), + &remote_path, + uploaded.metadata().file_size, + cancel, + ) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "upload a layer without adding it to latest files", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("upload a layer without adding it to latest files") + } + + /// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is + /// not added to be part of a future `index_part.json` upload. 
+ pub(crate) async fn copy_timeline_layer( + self: &Arc, + adopted: &Layer, + adopted_as: &Layer, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let source_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &adopted + .get_timeline_id() + .expect("Source timeline should be alive"), + self.tenant_shard_id.to_index(), + &adopted.layer_desc().layer_name(), + adopted.metadata().generation, + ); + + let target_remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + self.tenant_shard_id.to_index(), + &adopted_as.layer_desc().layer_name(), + adopted_as.metadata().generation, + ); + + backoff::retry( + || async { + upload::copy_timeline_layer( + &self.storage_impl, + &source_remote_path, + &target_remote_path, + cancel, + ) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "copy timeline layer", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("remote copy timeline layer") + } + + async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> { + match tokio::time::timeout( + DELETION_QUEUE_FLUSH_TIMEOUT, + self.deletion_queue_client.flush_immediate(), + ) + .await + { + Ok(result) => result, + Err(_timeout) => { + // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and + // to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion + // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here. + tracing::warn!( + "Timed out waiting for deletion queue flush, acking deletion anyway" + ); + Ok(()) + } + } + } + /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set. 
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something /// deletes leaked files if any and proceeds with deletion of index file at the end. @@ -1085,7 +1308,8 @@ impl RemoteTimelineClient { stopped .upload_queue_for_deletion - .latest_files + .dirty + .layer_metadata .drain() .map(|(file_name, meta)| { remote_layer_path( @@ -1102,28 +1326,41 @@ impl RemoteTimelineClient { let layer_deletion_count = layers.len(); self.deletion_queue_client.push_immediate(layers).await?; + // Delete the initdb.tar.zst, which is not always present, but deletion attempts of + // inexistant objects are not considered errors. + let initdb_path = + remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id); + self.deletion_queue_client + .push_immediate(vec![initdb_path]) + .await?; + // Do not delete index part yet, it is needed for possible retry. If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id); - // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't + // Execute all pending deletions, so that when we proceed to do a listing below, we aren't // taking the burden of listing all the layers that we already know we should delete. 
- self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; - let remaining = backoff::retry( + let cancel = shutdown_token(); + + let remaining = download_retry( || async { self.storage_impl - .list_files(Some(&timeline_storage_path)) + .list( + Some(&timeline_storage_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) .await }, - |_e| false, - FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "list_prefixes", - backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")), + "list remaining files", + &cancel, ) .await - .context("list prefixes")?; + .context("list files remaining files")? + .keys; // We will delete the current index_part object last, since it acts as a deletion // marker via its deleted_at attribute @@ -1149,10 +1386,8 @@ impl RemoteTimelineClient { if p == &latest_index { return false; } - if let Some(name) = p.object_name() { - if name == INITDB_PATH { - return false; - } + if p.object_name() == Some(INITDB_PRESERVED_PATH) { + return false; } true }) @@ -1185,7 +1420,7 @@ impl RemoteTimelineClient { // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait // for a flush to a persistent deletion list so that we may be sure deletion will occur. - self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( @@ -1207,11 +1442,11 @@ impl RemoteTimelineClient { while let Some(next_op) = upload_queue.queued_operations.front() { // Can we run this task now? let can_run_now = match next_op { - UploadOp::UploadLayer(_, _) => { + UploadOp::UploadLayer(..) => { // Can always be scheduled. true } - UploadOp::UploadMetadata(_, _) => { + UploadOp::UploadMetadata { .. } => { // These can only be performed after all the preceding operations // have finished. 
upload_queue.inprogress_tasks.is_empty() @@ -1253,7 +1488,7 @@ impl RemoteTimelineClient { UploadOp::UploadLayer(_, _) => { upload_queue.num_inprogress_layer_uploads += 1; } - UploadOp::UploadMetadata(_, _) => { + UploadOp::UploadMetadata { .. } => { upload_queue.num_inprogress_metadata_uploads += 1; } UploadOp::Delete(_) => { @@ -1314,6 +1549,7 @@ impl RemoteTimelineClient { /// queue. /// async fn perform_upload_task(self: &Arc, task: Arc) { + let cancel = shutdown_token(); // Loop to retry until it completes. loop { // If we're requested to shut down, close up shop and exit. @@ -1325,68 +1561,73 @@ impl RemoteTimelineClient { // the Future, but we're not 100% sure if the remote storage library // is cancellation safe, so we don't dare to do that. Hopefully, the // upload finishes or times out soon enough. - if task_mgr::is_shutdown_requested() { + if cancel.is_cancelled() { info!("upload task cancelled by shutdown request"); - match self.stop() { - Ok(()) => {} - Err(StopError::QueueUninitialized) => { - unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back") - } - } + self.stop(); return; } let upload_result: anyhow::Result<()> = match &task.op { UploadOp::UploadLayer(ref layer, ref layer_metadata) => { - let path = layer.local_path(); + let local_path = layer.local_path(); + + // We should only be uploading layers created by this `Tenant`'s lifetime, so + // the metadata in the upload should always match our current generation. 
+ assert_eq!(layer_metadata.generation, self.generation); + + let remote_path = remote_layer_path( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + layer_metadata.shard, + &layer.layer_desc().layer_name(), + layer_metadata.generation, + ); + upload::upload_timeline_layer( - self.conf, &self.storage_impl, - path, - layer_metadata, - self.generation, + local_path, + &remote_path, + layer_metadata.file_size, &self.cancel, ) .measure_remote_op( - self.tenant_shard_id.tenant_id, - self.timeline_id, RemoteOpFileKind::Layer, RemoteOpKind::Upload, Arc::clone(&self.metrics), ) .await } - UploadOp::UploadMetadata(ref index_part, _lsn) => { - let mention_having_future_layers = if cfg!(feature = "testing") { - index_part - .layer_metadata - .keys() - .any(|x| x.is_in_future(*_lsn)) - } else { - false - }; - + UploadOp::UploadMetadata { ref uploaded } => { let res = upload::upload_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, self.generation, - index_part, + uploaded, &self.cancel, ) .measure_remote_op( - self.tenant_shard_id.tenant_id, - self.timeline_id, RemoteOpFileKind::Index, RemoteOpKind::Upload, Arc::clone(&self.metrics), ) .await; if res.is_ok() { - self.update_remote_physical_size_gauge(Some(index_part)); + self.update_remote_physical_size_gauge(Some(uploaded)); + let mention_having_future_layers = if cfg!(feature = "testing") { + uploaded + .layer_metadata + .keys() + .any(|x| x.is_in_future(uploaded.metadata.disk_consistent_lsn())) + } else { + false + }; if mention_having_future_layers { // find rationale near crate::tenant::timeline::init::cleanup_future_layer - tracing::info!(disk_consistent_lsn=%_lsn, "uploaded an index_part.json with future layers -- this is ok! if shutdown now, expect future layer cleanup"); + tracing::info!( + disk_consistent_lsn = %uploaded.metadata.disk_consistent_lsn(), + "uploaded an index_part.json with future layers -- this is ok! 
if shutdown now, expect future layer cleanup" + ); } } res @@ -1415,6 +1656,10 @@ impl RemoteTimelineClient { Ok(()) => { break; } + Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => { + // loop around to do the proper stopping + continue; + } Err(e) => { let retries = task.retries.fetch_add(1, Ordering::SeqCst); @@ -1440,7 +1685,7 @@ impl RemoteTimelineClient { retries, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &shutdown_token(), + &cancel, ) .await; } @@ -1483,11 +1728,23 @@ impl RemoteTimelineClient { upload_queue.num_inprogress_layer_uploads -= 1; None } - UploadOp::UploadMetadata(_, lsn) => { + UploadOp::UploadMetadata { ref uploaded } => { upload_queue.num_inprogress_metadata_uploads -= 1; - // XXX monotonicity check? - upload_queue.projected_remote_consistent_lsn = Some(lsn); + // the task id is reused as a monotonicity check for storing the "clean" + // IndexPart. + let last_updater = upload_queue.clean.1; + let is_later = last_updater.is_some_and(|task_id| task_id < task.task_id); + let monotone = is_later || last_updater.is_none(); + + assert!(monotone, "no two index uploads should be completing at the same time, prev={last_updater:?}, task.task_id={}", task.task_id); + + // not taking ownership is wasteful + upload_queue.clean.0.clone_from(uploaded); + upload_queue.clean.1 = Some(task.task_id); + + let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn(); + if self.generation.is_none() { // Legacy mode: skip validating generation upload_queue.visible_remote_consistent_lsn.store(lsn); @@ -1523,10 +1780,10 @@ impl RemoteTimelineClient { .await; } - self.calls_unfinished_metric_end(&task.op); + self.metric_end(&task.op); } - fn calls_unfinished_metric_impl( + fn metric_impl( &self, op: &UploadOp, ) -> Option<( @@ -1539,9 +1796,9 @@ impl RemoteTimelineClient { UploadOp::UploadLayer(_, m) => ( RemoteOpFileKind::Layer, RemoteOpKind::Upload, - RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size()), + 
RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size), ), - UploadOp::UploadMetadata(_, _) => ( + UploadOp::UploadMetadata { .. } => ( RemoteOpFileKind::Index, RemoteOpKind::Upload, DontTrackSize { @@ -1563,17 +1820,17 @@ impl RemoteTimelineClient { Some(res) } - fn calls_unfinished_metric_begin(&self, op: &UploadOp) { - let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { + fn metric_begin(&self, op: &UploadOp) { + let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) { Some(x) => x, None => return, }; let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes); - guard.will_decrement_manually(); // in unfinished_ops_metric_end() + guard.will_decrement_manually(); // in metric_end(), see right below } - fn calls_unfinished_metric_end(&self, op: &UploadOp) { - let (file_kind, op_kind, track_bytes) = match self.calls_unfinished_metric_impl(op) { + fn metric_end(&self, op: &UploadOp) { + let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) { Some(x) => x, None => return, }; @@ -1585,19 +1842,25 @@ impl RemoteTimelineClient { /// Use [`RemoteTimelineClient::shutdown`] for graceful stop. /// /// In-progress operations will still be running after this function returns. - /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))` + /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))` /// to wait for them to complete, after calling this function. - pub(crate) fn stop(&self) -> Result<(), StopError> { + pub(crate) fn stop(&self) { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet. // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business. 
let mut guard = self.upload_queue.lock().unwrap(); - match &mut *guard { - UploadQueue::Uninitialized => Err(StopError::QueueUninitialized), + self.stop_impl(&mut guard); + } + + fn stop_impl(&self, guard: &mut std::sync::MutexGuard) { + match &mut **guard { + UploadQueue::Uninitialized => { + info!("UploadQueue is in state Uninitialized, nothing to do"); + **guard = UploadQueue::Stopped(UploadQueueStopped::Uninitialized); + } UploadQueue::Stopped(_) => { // nothing to do info!("another concurrent task already shut down the queue"); - Ok(()) } UploadQueue::Initialized(initialized) => { info!("shutting down upload queue"); @@ -1611,10 +1874,9 @@ impl RemoteTimelineClient { // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it. let upload_queue_for_deletion = UploadQueueInitialized { task_counter: 0, - latest_files: initialized.latest_files.clone(), + dirty: initialized.dirty.clone(), + clean: initialized.clean.clone(), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: initialized.latest_metadata.clone(), - projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: initialized .visible_remote_consistent_lsn .clone(), @@ -1630,11 +1892,13 @@ impl RemoteTimelineClient { }; let upload_queue = std::mem::replace( - &mut *guard, - UploadQueue::Stopped(UploadQueueStopped { - upload_queue_for_deletion, - deleted_at: SetDeletedFlagProgress::NotRunning, - }), + &mut **guard, + UploadQueue::Stopped(UploadQueueStopped::Deletable( + UploadQueueStoppedDeletable { + upload_queue_for_deletion, + deleted_at: SetDeletedFlagProgress::NotRunning, + }, + )), ); if let UploadQueue::Initialized(qi) = upload_queue { qi @@ -1658,35 +1922,19 @@ impl RemoteTimelineClient { // Tear down queued ops for op in qi.queued_operations.into_iter() { - self.calls_unfinished_metric_end(&op); + self.metric_end(&op); // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err() // which is 
exactly what we want to happen. drop(op); } - - // We're done. - drop(guard); - Ok(()) } } } +} - pub(crate) fn get_layers_metadata( - &self, - layers: Vec, - ) -> anyhow::Result>> { - let q = self.upload_queue.lock().unwrap(); - let q = match &*q { - UploadQueue::Stopped(_) | UploadQueue::Uninitialized => { - anyhow::bail!("queue is in state {}", q.as_str()) - } - UploadQueue::Initialized(inner) => inner, - }; - - let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned()); - - Ok(decorated.collect()) - } +pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}"); + RemotePath::from_string(&path).expect("Failed to construct path") } pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { @@ -1694,6 +1942,11 @@ pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { RemotePath::from_string(&path).expect("Failed to construct path") } +fn remote_timelines_path_unsharded(tenant_id: &TenantId) -> RemotePath { + let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}"); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timeline_path( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, @@ -1708,14 +1961,14 @@ pub fn remote_layer_path( tenant_id: &TenantId, timeline_id: &TimelineId, shard: ShardIndex, - layer_file_name: &LayerFileName, + layer_file_name: &LayerName, generation: Generation, ) -> RemotePath { // Generation-aware key format let path = format!( "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}", shard.get_suffix(), - layer_file_name.file_name(), + layer_file_name, generation.get_suffix() ); @@ -1729,6 +1982,16 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId .expect("Failed to construct path") } +pub fn remote_initdb_preserved_archive_path( + tenant_id: &TenantId, + timeline_id: &TimelineId, +) -> RemotePath { + 
RemotePath::from_string(&format!( + "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}" + )) + .expect("Failed to construct path") +} + pub fn remote_index_path( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, @@ -1766,29 +2029,6 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } } -/// Files on the remote storage are stored with paths, relative to the workdir. -/// That path includes in itself both tenant and timeline ids, allowing to have a unique remote storage path. -/// -/// Errors if the path provided does not start from pageserver's workdir. -pub fn remote_path( - conf: &PageServerConf, - local_path: &Utf8Path, - generation: Generation, -) -> anyhow::Result { - let stripped = local_path - .strip_prefix(&conf.workdir) - .context("Failed to strip workdir prefix")?; - - let suffixed = format!("{0}{1}", stripped, generation.get_suffix()); - - RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| { - format!( - "to resolve remote part of path {:?} for base {:?}", - local_path, conf.workdir - ) - }) -} - #[cfg(test)] mod tests { use super::*; @@ -1796,14 +2036,13 @@ mod tests { context::RequestContext, tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::Layer, - Generation, Tenant, Timeline, + storage_layer::layer::local_layer_path, + Tenant, Timeline, }, DEFAULT_PG_VERSION, }; use std::collections::HashSet; - use utils::lsn::Lsn; pub(super) fn dummy_contents(name: &str) -> Vec { format!("contents for {name}").into() @@ -1826,8 +2065,8 @@ mod tests { TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap() } - fn assert_file_list(a: &HashSet, b: &[&str]) { - let mut avec: Vec = a.iter().map(|x| x.file_name()).collect(); + fn assert_file_list(a: &HashSet, b: &[&str]) { + let mut avec: Vec = a.iter().map(|x| x.to_string()).collect(); avec.sort(); let mut bvec = b.to_vec(); @@ -1903,7 +2142,8 @@ mod tests { fn span(&self) -> tracing::Span { tracing::info_span!( "test", - 
tenant_id = %self.harness.tenant_id, + tenant_id = %self.harness.tenant_shard_id.tenant_id, + shard_id = %self.harness.tenant_shard_id.shard_slug(), timeline_id = %TIMELINE_ID ) } @@ -1937,11 +2177,11 @@ mod tests { tenant_ctx: _tenant_ctx, } = test_setup; - let client = timeline.remote_client.as_ref().unwrap(); + let client = &timeline.remote_client; // Download back the index.json, and check that the list of files is correct let initial_index_part = match client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .unwrap() { @@ -1952,7 +2192,7 @@ mod tests { .layer_metadata .keys() .map(|f| f.to_owned()) - .collect::>(); + .collect::>(); let initial_layer = { assert!(initial_layers.len() == 1); initial_layers.into_iter().next().unwrap() @@ -1978,12 +2218,21 @@ mod tests { ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz")) ] .into_iter() - .map(|(name, contents): (LayerFileName, Vec)| { - std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap(); + .map(|(name, contents): (LayerName, Vec)| { + + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &name, + &generation, + ); + std::fs::write(&local_path, &contents).unwrap(); Layer::for_resident( harness.conf, &timeline, + local_path, name, LayerFileMetadata::new(contents.len() as u64, generation, shard), ) @@ -2014,7 +2263,7 @@ mod tests { // Schedule upload of index. 
Check that it is queued let metadata = dummy_metadata(Lsn(0x20)); client - .schedule_index_upload_for_metadata_update(&metadata) + .schedule_index_upload_for_full_metadata_update(&metadata) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2035,7 +2284,7 @@ mod tests { // Download back the index.json, and check that the list of files is correct let index_part = match client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .unwrap() { @@ -2050,9 +2299,9 @@ mod tests { .map(|f| f.to_owned()) .collect(), &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), ], ); assert_eq!(index_part.metadata, metadata); @@ -2066,7 +2315,7 @@ mod tests { // keep using schedule_layer_file_deletion because we don't have a way to wait for the // spawn_blocking started by the drop. 
client - .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()]) + .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()]) .unwrap(); { let mut guard = client.upload_queue.lock().unwrap(); @@ -2084,9 +2333,9 @@ mod tests { } assert_remote_files( &[ - &initial_layer.file_name(), - &layers[0].layer_desc().filename().file_name(), - &layers[1].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[0].layer_desc().layer_name().to_string(), + &layers[1].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2099,9 +2348,9 @@ mod tests { assert_remote_files( &[ - &initial_layer.file_name(), - &layers[1].layer_desc().filename().file_name(), - &layers[2].layer_desc().filename().file_name(), + &initial_layer.to_string(), + &layers[1].layer_desc().layer_name().to_string(), + &layers[2].layer_desc().layer_name().to_string(), "index_part.json", ], &remote_timeline_dir, @@ -2119,20 +2368,23 @@ mod tests { timeline, .. } = TestSetup::new("metrics").await.unwrap(); - let client = timeline.remote_client.as_ref().unwrap(); - let timeline_path = harness.timeline_path(&TIMELINE_ID); + let client = &timeline.remote_client; - let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(); + let local_path = local_layer_path( + harness.conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &layer_file_name_1, + &harness.generation, + ); let content_1 = dummy_contents("foo"); - std::fs::write( - timeline_path.join(layer_file_name_1.file_name()), - &content_1, - ) - .unwrap(); + std::fs::write(&local_path, &content_1).unwrap(); let layer_file_1 = Layer::for_resident( harness.conf, &timeline, + local_path, layer_file_name_1.clone(), 
LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard), ); @@ -2201,12 +2453,7 @@ mod tests { async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart { // An empty IndexPart, just sufficient to ensure deserialization will succeed - let example_metadata = TimelineMetadata::example(); - let example_index_part = IndexPart::new( - HashMap::new(), - example_metadata.disk_consistent_lsn(), - example_metadata, - ); + let example_index_part = IndexPart::example(); let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap(); @@ -2237,7 +2484,7 @@ mod tests { let client = test_state.build_client(get_generation); let download_r = client - .download_index_file(CancellationToken::new()) + .download_index_file(&CancellationToken::new()) .await .expect("download should always succeed"); assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_))); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d3956163c8..d0385e4aee 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -5,34 +5,36 @@ use std::collections::HashSet; use std::future::Future; +use std::str::FromStr; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::shard::TenantShardId; use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::warn; -use utils::timeout::timeout_cancellable; -use utils::{backoff, crashsafe}; +use utils::backoff; use crate::config::PageServerConf; -use crate::tenant::remote_timeline_client::{ - download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT, -}; -use crate::tenant::storage_layer::LayerFileName; -use 
crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::context::RequestContext; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; +use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -use crate::virtual_file::on_fatal_io_error; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; -use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; +use utils::pausable_failpoint; use super::index::{IndexPart, LayerFileMetadata}; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, + remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -40,20 +42,21 @@ use super::{ /// in the metadata. (In the future, we might do more cross-checks, like CRC validation) /// /// Returns the size of the downloaded file. 
+#[allow(clippy::too_many_arguments)] pub async fn download_layer_file<'a>( conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - layer_file_name: &'a LayerFileName, + layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, + local_path: &Utf8Path, cancel: &CancellationToken, + ctx: &RequestContext, ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); - let local_path = conf - .timeline_path(&tenant_shard_id, &timeline_id) - .join(layer_file_name.file_name()); + let timeline_path = conf.timeline_path(&tenant_shard_id, &timeline_id); let remote_path = remote_layer_path( &tenant_shard_id.tenant_id, @@ -73,105 +76,22 @@ pub async fn download_layer_file<'a>( // For more context about durable_rename check this email from postgres mailing list: // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com // If pageserver crashes the temp file will be deleted on startup and re-downloaded. 
- let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION); + let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); - let cancel_inner = cancel.clone(); - let (mut destination_file, bytes_amount) = download_retry( - || async { - let destination_file = tokio::fs::File::create(&temp_file_path) - .await - .with_context(|| format!("create a destination file for layer '{temp_file_path}'")) - .map_err(DownloadError::Other)?; - - // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local - // file: the write to local file doesn't start until after the request header is returned - // and we start draining the body stream below - let download = download_cancellable(&cancel_inner, storage.download(&remote_path)) - .await - .with_context(|| { - format!( - "open a download stream for layer with remote storage path '{remote_path:?}'" - ) - }) - .map_err(DownloadError::Other)?; - - let mut destination_file = - tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); - - let mut reader = tokio_util::io::StreamReader::new(download.download_stream); - - // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file, - // and we will unlink the temporary file if there is an error. This unlink is important because we - // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that - // we will imminiently try and write to again. - let bytes_amount: u64 = match timeout_cancellable( - DOWNLOAD_TIMEOUT, - &cancel_inner, - tokio::io::copy_buf(&mut reader, &mut destination_file), - ) - .await - .with_context(|| { - format!( - "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" - ) - }) - .map_err(DownloadError::Other)? - { - Ok(b) => Ok(b), - Err(e) => { - // Remove incomplete files: on restart Timeline would do this anyway, but we must - // do it here for the retry case. 
- if let Err(e) = tokio::fs::remove_file(&temp_file_path).await { - on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}")); - } - Err(e) - } - } - .with_context(|| { - format!( - "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}" - ) - }) - .map_err(DownloadError::Other)?; - - let destination_file = destination_file.into_inner(); - - Ok((destination_file, bytes_amount)) - }, + let bytes_amount = download_retry( + || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, &format!("download {remote_path:?}"), cancel, ) .await?; - // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: - // A file will not be closed immediately when it goes out of scope if there are any IO operations - // that have not yet completed. To ensure that a file is closed immediately when it is dropped, - // you should call flush before dropping it. - // - // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because - // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations. - // But for additional safety lets check/wait for any pending operations. 
- destination_file - .flush() - .await - .with_context(|| format!("flush source file at {temp_file_path}")) - .map_err(DownloadError::Other)?; - - let expected = layer_metadata.file_size(); + let expected = layer_metadata.file_size; if expected != bytes_amount { return Err(DownloadError::Other(anyhow!( "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}", ))); } - // not using sync_data because it can lose file size update - destination_file - .sync_all() - .await - .with_context(|| format!("failed to fsync source file at {temp_file_path}")) - .map_err(DownloadError::Other)?; - drop(destination_file); - fail::fail_point!("remote-storage-download-pre-rename", |_| { Err(DownloadError::Other(anyhow!( "remote-storage-download-pre-rename failpoint triggered" @@ -183,19 +103,161 @@ pub async fn download_layer_file<'a>( .with_context(|| format!("rename download layer file to {local_path}")) .map_err(DownloadError::Other)?; - crashsafe::fsync_async(&local_path) - .await - .with_context(|| format!("fsync layer file {local_path}")) - .map_err(DownloadError::Other)?; + // We use fatal_err() below because the after the rename above, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let work = { + let ctx = ctx.detached_child(ctx.task_kind(), ctx.download_behavior()); + async move { + let timeline_dir = VirtualFile::open(&timeline_path, &ctx) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + }; + crate::virtual_file::io_engine::get() + .spawn_blocking_and_block_on_if_std(work) + .await; tracing::debug!("download complete: {local_path}"); Ok(bytes_amount) } +/// Download the object `src_path` in the remote `storage` to local path `dst_path`. 
+/// +/// If Ok() is returned, the download succeeded and the inode & data have been made durable. +/// (Note that the directory entry for the inode is not made durable.) +/// The file size in bytes is returned. +/// +/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked. +/// The unlinking has _not_ been made durable. +async fn download_object<'a>( + storage: &'a GenericRemoteStorage, + src_path: &RemotePath, + dst_path: &Utf8PathBuf, + cancel: &CancellationToken, + #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, +) -> Result { + let res = match crate::virtual_file::io_engine::get() { + crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"), + crate::virtual_file::io_engine::IoEngine::StdFs => { + async { + let destination_file = tokio::fs::File::create(dst_path) + .await + .with_context(|| format!("create a destination file for layer '{dst_path}'")) + .map_err(DownloadError::Other)?; + + let download = storage.download(src_path, cancel).await?; + + pausable_failpoint!("before-downloading-layer-stream-pausable"); + + let mut buf_writer = + tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file); + + let mut reader = tokio_util::io::StreamReader::new(download.download_stream); + + let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?; + buf_writer.flush().await?; + + let mut destination_file = buf_writer.into_inner(); + + // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that: + // A file will not be closed immediately when it goes out of scope if there are any IO operations + // that have not yet completed. To ensure that a file is closed immediately when it is dropped, + // you should call flush before dropping it. + // + // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because + // we assume that `destination_file` file is fully written. 
I e there is no pending .write(...).await operations. + // But for additional safety lets check/wait for any pending operations. + destination_file + .flush() + .await + .with_context(|| format!("flush source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + #[cfg(target_os = "linux")] + crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { + use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; + use bytes::BytesMut; + async { + let destination_file = VirtualFile::create(dst_path, ctx) + .await + .with_context(|| format!("create a destination file for layer '{dst_path}'")) + .map_err(DownloadError::Other)?; + + let mut download = storage.download(src_path, cancel).await?; + + pausable_failpoint!("before-downloading-layer-stream-pausable"); + + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. + // There's chunks_vectored() on the stream. 
+ let (bytes_amount, destination_file) = async { + let size_tracking = size_tracking_writer::Writer::new(destination_file); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + size_tracking, + BytesMut::with_capacity(super::BUFFER_SIZE), + ); + while let Some(res) = + futures::StreamExt::next(&mut download.download_stream).await + { + let chunk = match res { + Ok(chunk) => chunk, + Err(e) => return Err(e), + }; + buffered + .write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk), ctx) + .await?; + } + let size_tracking = buffered.flush_and_into_inner(ctx).await?; + Ok(size_tracking.into_inner()) + } + .await?; + + // not using sync_data because it can lose file size update + destination_file + .sync_all() + .await + .with_context(|| format!("failed to fsync source file at {dst_path}")) + .map_err(DownloadError::Other)?; + + Ok(bytes_amount) + } + .await + } + }; + + // in case the download failed, clean up + match res { + Ok(bytes_amount) => Ok(bytes_amount), + Err(e) => { + if let Err(e) = tokio::fs::remove_file(dst_path).await { + if e.kind() != std::io::ErrorKind::NotFound { + on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}")); + } + } + Err(e) + } + } +} + const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download"; -pub fn is_temp_download_file(path: &Utf8Path) -> bool { +pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool { let extension = path.extension(); match extension { Some(TEMP_DOWNLOAD_EXTENSION) => true, @@ -204,41 +266,31 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool { } } -/// List timelines of given tenant in remote storage -pub async fn list_remote_timelines( +async fn list_identifiers( storage: &GenericRemoteStorage, - tenant_shard_id: TenantShardId, + prefix: RemotePath, cancel: CancellationToken, -) -> anyhow::Result<(HashSet, HashSet)> { - let remote_path = remote_timelines_path(&tenant_shard_id); - - fail::fail_point!("storage-sync-list-remote-timelines", |_| { - 
anyhow::bail!("storage-sync-list-remote-timelines"); - }); - - let cancel_inner = cancel.clone(); +) -> anyhow::Result<(HashSet, HashSet)> +where + T: FromStr + Eq + std::hash::Hash, +{ let listing = download_retry_forever( - || { - download_cancellable( - &cancel_inner, - storage.list(Some(&remote_path), ListingMode::WithDelimiter), - ) - }, - &format!("list timelines for {tenant_shard_id}"), - cancel, + || storage.list(Some(&prefix), ListingMode::WithDelimiter, None, &cancel), + &format!("list identifiers in prefix {prefix}"), + &cancel, ) .await?; - let mut timeline_ids = HashSet::new(); + let mut parsed_ids = HashSet::new(); let mut other_prefixes = HashSet::new(); - for timeline_remote_storage_key in listing.prefixes { - let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| { - anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}") + for id_remote_storage_key in listing.prefixes { + let object_name = id_remote_storage_key.object_name().ok_or_else(|| { + anyhow::anyhow!("failed to get object name for key {id_remote_storage_key}") })?; - match object_name.parse::() { - Ok(t) => timeline_ids.insert(t), + match object_name.parse::() { + Ok(t) => parsed_ids.insert(t), Err(_) => other_prefixes.insert(object_name.to_string()), }; } @@ -250,7 +302,31 @@ pub async fn list_remote_timelines( other_prefixes.insert(object_name.to_string()); } - Ok((timeline_ids, other_prefixes)) + Ok((parsed_ids, other_prefixes)) +} + +/// List shards of given tenant in remote storage +pub(crate) async fn list_remote_tenant_shards( + storage: &GenericRemoteStorage, + tenant_id: TenantId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + let remote_path = remote_tenant_path(&TenantShardId::unsharded(tenant_id)); + list_identifiers::(storage, remote_path, cancel).await +} + +/// List timelines of given tenant shard in remote storage +pub async fn list_remote_timelines( + storage: &GenericRemoteStorage, + tenant_shard_id: 
TenantShardId, + cancel: CancellationToken, +) -> anyhow::Result<(HashSet, HashSet)> { + fail::fail_point!("storage-sync-list-remote-timelines", |_| { + anyhow::bail!("storage-sync-list-remote-timelines"); + }); + + let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash(); + list_identifiers::(storage, remote_path, cancel).await } async fn do_download_index_part( @@ -258,29 +334,22 @@ async fn do_download_index_part( tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, index_generation: Generation, - cancel: CancellationToken, -) -> Result { - use futures::stream::StreamExt; - + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); - let cancel_inner = cancel.clone(); let index_part_bytes = download_retry_forever( || async { - // Cancellation: if is safe to cancel this future because we're just downloading into - // a memory buffer, not touching local disk. 
- let index_part_download = - download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; + let download = storage.download(&remote_path, cancel).await?; - let mut index_part_bytes = Vec::new(); - let mut stream = std::pin::pin!(index_part_download.download_stream); - while let Some(chunk) = stream.next().await { - let chunk = chunk - .with_context(|| format!("download index part at {remote_path:?}")) - .map_err(DownloadError::Other)?; - index_part_bytes.extend_from_slice(&chunk[..]); - } - Ok(index_part_bytes) + let mut bytes = Vec::new(); + + let stream = download.download_stream; + let mut stream = StreamReader::new(stream); + + tokio::io::copy_buf(&mut stream, &mut bytes).await?; + + Ok(bytes) }, &format!("download {remote_path:?}"), cancel, @@ -288,10 +357,10 @@ async fn do_download_index_part( .await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) - .with_context(|| format!("download index part file at {remote_path:?}")) + .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok(index_part) + Ok((index_part, index_generation)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -300,13 +369,13 @@ async fn do_download_index_part( /// In this function we probe for the most recent index in a generation <= our current generation. 
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(super) async fn download_index_part( +pub(crate) async fn download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, my_generation: Generation, - cancel: CancellationToken, -) -> Result { + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { @@ -325,14 +394,8 @@ pub(super) async fn download_index_part( // index in our generation. // // This is an optimization to avoid doing the listing for the general case below. - let res = do_download_index_part( - storage, - tenant_shard_id, - timeline_id, - my_generation, - cancel.clone(), - ) - .await; + let res = + do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; match res { Ok(index_part) => { tracing::debug!( @@ -357,7 +420,7 @@ pub(super) async fn download_index_part( tenant_shard_id, timeline_id, my_generation.previous(), - cancel.clone(), + cancel, ) .await; match res { @@ -379,16 +442,18 @@ pub(super) async fn download_index_part( // objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent // to constructing a full index path with no generation, because the generation is a suffix. 
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); - let indices = backoff::retry( - || async { storage.list_files(Some(&index_prefix)).await }, - |_| false, - FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, - "listing index_part files", - backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")), + + let indices = download_retry( + || async { + storage + .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .await + }, + "list index_part files", + cancel, ) - .await - .map_err(DownloadError::Other)?; + .await? + .keys; // General case logic for which index to use: the latest index whose generation // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md @@ -430,6 +495,9 @@ pub(crate) async fn download_initdb_tar_zst( let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id); + let remote_preserved_path = + remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id); + let timeline_path = conf.timelines_path(tenant_shard_id); if !timeline_path.exists() { @@ -442,8 +510,6 @@ pub(crate) async fn download_initdb_tar_zst( "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}" )); - let cancel_inner = cancel.clone(); - let file = download_retry( || async { let file = OpenOptions::new() @@ -456,18 +522,17 @@ pub(crate) async fn download_initdb_tar_zst( .with_context(|| format!("tempfile creation {temp_path}")) .map_err(DownloadError::Other)?; - let download = - download_cancellable(&cancel_inner, storage.download(&remote_path)).await?; + let download = match storage.download(&remote_path, cancel).await { + Ok(dl) => dl, + Err(DownloadError::NotFound) => { + storage.download(&remote_preserved_path, cancel).await? 
+ } + Err(other) => Err(other)?, + }; let mut download = tokio_util::io::StreamReader::new(download.download_stream); - let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file); + let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file); - // TODO: this consumption of the response body should be subject to timeout + cancellation, but - // not without thinking carefully about how to recover safely from cancelling a write to - // local storage (e.g. by writing into a temp file as we do in download_layer) - tokio::io::copy_buf(&mut download, &mut writer) - .await - .with_context(|| format!("download initdb.tar.zst at {remote_path:?}")) - .map_err(DownloadError::Other)?; + tokio::io::copy_buf(&mut download, &mut writer).await?; let mut file = writer.into_inner(); @@ -498,12 +563,12 @@ pub(crate) async fn download_initdb_tar_zst( /// Helper function to handle retries for a download operation. /// -/// Remote operations can fail due to rate limits (IAM, S3), spurious network +/// Remote operations can fail due to rate limits (S3), spurious network /// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times, /// with backoff. 
/// /// (See similar logic for uploads in `perform_upload_task`) -async fn download_retry( +pub(super) async fn download_retry( op: O, description: &str, cancel: &CancellationToken, @@ -514,19 +579,21 @@ where { backoff::retry( op, - |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + DownloadError::is_permanent, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, description, - backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled), + cancel, ) .await + .ok_or_else(|| DownloadError::Cancelled) + .and_then(|x| x) } async fn download_retry_forever( op: O, description: &str, - cancel: CancellationToken, + cancel: &CancellationToken, ) -> Result where O: FnMut() -> F, @@ -534,11 +601,13 @@ where { backoff::retry( op, - |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound), + DownloadError::is_permanent, FAILED_DOWNLOAD_WARN_THRESHOLD, u32::MAX, description, - backoff::Cancel::new(cancel, || DownloadError::Cancelled), + cancel, ) .await + .ok_or_else(|| DownloadError::Cancelled) + .and_then(|x| x) } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 0abfdeef02..6233a3477e 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -5,57 +5,17 @@ use std::collections::HashMap; use chrono::NaiveDateTime; +use pageserver_api::models::AuxFilePolicy; use serde::{Deserialize, Serialize}; -use utils::bin_ser::SerializeError; +use utils::id::TimelineId; use crate::tenant::metadata::TimelineMetadata; -use crate::tenant::storage_layer::LayerFileName; -use crate::tenant::upload_queue::UploadQueueInitialized; +use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; use pageserver_api::shard::ShardIndex; use utils::lsn::Lsn; -/// Metadata gathered for each of the layer files. 
-/// -/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which -/// might have less or more metadata depending if upgrading or rolling back an upgrade. -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] -//#[cfg_attr(test, derive(Default))] -pub struct LayerFileMetadata { - file_size: u64, - - pub(crate) generation: Generation, - - pub(crate) shard: ShardIndex, -} - -impl From<&'_ IndexLayerMetadata> for LayerFileMetadata { - fn from(other: &IndexLayerMetadata) -> Self { - LayerFileMetadata { - file_size: other.file_size, - generation: other.generation, - shard: other.shard, - } - } -} - -impl LayerFileMetadata { - pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self { - LayerFileMetadata { - file_size, - generation, - shard, - } - } - - pub fn file_size(&self) -> u64 { - self.file_size - } -} - -// TODO seems like another part of the remote storage file format -// compatibility issue, see https://github.com/neondatabase/neon/issues/3072 /// In-memory representation of an `index_part.json` file /// /// Contains the data about all files in the timeline, present remotely and its metadata. @@ -76,15 +36,35 @@ pub struct IndexPart { /// /// Older versions of `IndexPart` will not have this property or have only a part of metadata /// that latest version stores. - pub layer_metadata: HashMap, + pub layer_metadata: HashMap, - // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. - // It's duplicated for convenience when reading the serialized structure, but is - // private because internally we would read from metadata instead. - disk_consistent_lsn: Lsn, + /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the + /// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be + /// reused. 
+ pub(super) disk_consistent_lsn: Lsn, - #[serde(rename = "metadata_bytes")] + // TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version Adding + // the "alias = metadata" was forgotten in #7693, so we have to use "rewrite = metadata_bytes" + // for backwards compatibility. + #[serde( + rename = "metadata_bytes", + alias = "metadata", + with = "crate::tenant::metadata::modern_serde" + )] pub metadata: TimelineMetadata, + + #[serde(default)] + pub(crate) lineage: Lineage, + + /// Describes the kind of aux files stored in the timeline. + /// + /// The value is modified during file ingestion when the latest wanted value communicated via tenant config is applied if it is acceptable. + /// A V1 setting after V2 files have been committed is not accepted. + /// + /// None means no aux files have been written to the storage before the point + /// when this flag is introduced. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub(crate) last_aux_file_policy: Option, } impl IndexPart { @@ -97,40 +77,35 @@ impl IndexPart { /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers /// is always generated from the keys of `layer_metadata`) /// - 4: timeline_layers is fully removed. - const LATEST_VERSION: usize = 4; + /// - 5: lineage was added + /// - 6: last_aux_file_policy is added. + /// - 7: metadata_bytes is no longer written, but still read + const LATEST_VERSION: usize = 7; // Versions we may see when reading from a bucket. 
- pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7]; pub const FILE_NAME: &'static str = "index_part.json"; - pub fn new( - layers_and_metadata: HashMap, - disk_consistent_lsn: Lsn, - metadata: TimelineMetadata, - ) -> Self { - // Transform LayerFileMetadata into IndexLayerMetadata - let layer_metadata = layers_and_metadata - .into_iter() - .map(|(k, v)| (k, IndexLayerMetadata::from(v))) - .collect(); - - Self { + pub(crate) fn empty(metadata: TimelineMetadata) -> Self { + IndexPart { version: Self::LATEST_VERSION, - layer_metadata, - disk_consistent_lsn, + layer_metadata: Default::default(), + disk_consistent_lsn: metadata.disk_consistent_lsn(), metadata, deleted_at: None, + lineage: Default::default(), + last_aux_file_policy: None, } } - pub fn get_version(&self) -> usize { + pub fn version(&self) -> usize { self.version } /// If you want this under normal operations, read it from self.metadata: /// this method is just for the scrubber to use when validating an index. - pub fn get_disk_consistent_lsn(&self) -> Lsn { + pub fn duplicated_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn } @@ -141,26 +116,23 @@ impl IndexPart { pub fn to_s3_bytes(&self) -> serde_json::Result> { serde_json::to_vec(self) } -} -impl TryFrom<&UploadQueueInitialized> for IndexPart { - type Error = SerializeError; + #[cfg(test)] + pub(crate) fn example() -> Self { + Self::empty(TimelineMetadata::example()) + } - fn try_from(upload_queue: &UploadQueueInitialized) -> Result { - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - let metadata = upload_queue.latest_metadata.clone(); - - Ok(Self::new( - upload_queue.latest_files.clone(), - disk_consistent_lsn, - metadata, - )) + pub(crate) fn last_aux_file_policy(&self) -> Option { + self.last_aux_file_policy } } -/// Serialized form of [`LayerFileMetadata`]. 
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] -pub struct IndexLayerMetadata { +/// Metadata gathered for each of the layer files. +/// +/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which +/// might have less or more metadata depending if upgrading or rolling back an upgrade. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct LayerFileMetadata { pub file_size: u64, #[serde(default = "Generation::none")] @@ -172,19 +144,86 @@ pub struct IndexLayerMetadata { pub shard: ShardIndex, } -impl From for IndexLayerMetadata { - fn from(other: LayerFileMetadata) -> Self { - IndexLayerMetadata { - file_size: other.file_size, - generation: other.generation, - shard: other.shard, +impl LayerFileMetadata { + pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self { + LayerFileMetadata { + file_size, + generation, + shard, } } } +/// Limited history of earlier ancestors. +/// +/// A timeline can have more than 1 earlier ancestor, in the rare case that it was repeatedly +/// reparented by having an later timeline be detached from it's ancestor. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)] +pub(crate) struct Lineage { + /// Has the `reparenting_history` been truncated to [`Lineage::REMEMBER_AT_MOST`]. + #[serde(skip_serializing_if = "is_false", default)] + reparenting_history_truncated: bool, + + /// Earlier ancestors, truncated when [`Self::reparenting_history_truncated`] + /// + /// These are stored in case we want to support WAL based DR on the timeline. There can be many + /// of these and at most one [`Self::original_ancestor`]. There cannot be more reparentings + /// after [`Self::original_ancestor`] has been set. + #[serde(skip_serializing_if = "Vec::is_empty", default)] + reparenting_history: Vec, + + /// The ancestor from which this timeline has been detached from and when. 
+ /// + /// If you are adding support for detaching from a hierarchy, consider changing the ancestry + /// into a `Vec<(TimelineId, Lsn)>` to be a path instead. + #[serde(skip_serializing_if = "Option::is_none", default)] + original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>, +} + +fn is_false(b: &bool) -> bool { + !b +} + +impl Lineage { + const REMEMBER_AT_MOST: usize = 100; + + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + if self.reparenting_history.last() == Some(old_ancestor) { + // do not re-record it + return; + } + + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; + + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + } + + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { + assert!(self.original_ancestor.is_none()); + + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + } + + /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed + /// to start a read/write primary at this lsn". + /// + /// Returns true if the Lsn was previously our branch point. + pub(crate) fn is_previous_ancestor_lsn(&self, lsn: Lsn) -> bool { + self.original_ancestor + .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) + } +} + #[cfg(test)] mod tests { use super::*; + use std::str::FromStr; + use utils::id::TimelineId; #[test] fn v1_indexpart_is_parsed() { @@ -203,12 +242,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
version: 1, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, @@ -219,6 +258,8 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -243,12 +284,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? version: 1, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: 9007199254741001, @@ -259,6 +300,8 @@ mod tests { disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), deleted_at: None, + lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -284,12 +327,12 @@ mod tests { // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? 
version: 2, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. file_size: 9007199254741001, @@ -299,8 +342,9 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); @@ -345,6 +389,8 @@ mod tests { ]) .unwrap(), deleted_at: None, + lineage: Lineage::default(), + last_aux_file_policy: None, }; let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); @@ -368,12 +414,12 @@ mod tests { let expected = IndexPart { version: 4, layer_metadata: HashMap::from([ - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { file_size: 25600000, generation: Generation::none(), shard: ShardIndex::unsharded() }), - ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { // serde_json should always parse this but this might be a double with jq for // example. 
file_size: 9007199254741001, @@ -383,11 +429,163 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Lineage::default(), + last_aux_file_policy: None, }; let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } + + #[test] + fn v5_indexpart_is_parsed() { + let example = r#"{ + "version":5, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499":{"file_size":23289856,"generation":1}, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619":{"file_size":1015808,"generation":1}}, + 
"disk_consistent_lsn":"0/15A7618", + "metadata_bytes":[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + } + }"#; + + let expected = IndexPart { + version: 5, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF420-00000000014EF499".parse().unwrap(), LayerFileMetadata { + file_size: 23289856, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000014EF499-00000000015A7619".parse().unwrap(), LayerFileMetadata { + file_size: 1015808, + generation: Generation::new(1), + shard: ShardIndex::unsharded(), + }) + ]), + disk_consistent_lsn: Lsn::from_str("0/15A7618").unwrap(), + metadata: 
TimelineMetadata::from_bytes(&[226,88,25,241,0,46,0,4,0,0,0,0,1,90,118,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,78,244,32,0,0,0,0,1,78,244,32,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: None, + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + last_aux_file_policy: None, + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v6_indexpart_is_parsed() { + let example = r#"{ + "version":6, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + 
"disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "deleted_at": "2023-07-31T09:00:00.123", + "lineage":{ + "original_ancestor":["e2bfd8c633d713d279e6fcd2bcc15b6d","0/15A7618","2024-05-07T18:52:36.322426563"], + "reparenting_history":["e1bfd8c633d713d279e6fcd2bcc15b6d"] + }, + "last_aux_file_policy": "V2" + }"#; + + let expected = IndexPart { + version: 6, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Lineage { + reparenting_history_truncated: false, + reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], + original_ancestor: Some((TimelineId::from_str("e2bfd8c633d713d279e6fcd2bcc15b6d").unwrap(), Lsn::from_str("0/15A7618").unwrap(), parse_naive_datetime("2024-05-07T18:52:36.322426563"))), + }, + last_aux_file_policy: Some(AuxFilePolicy::V2), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + #[test] + fn v7_indexpart_is_parsed() { + let example = r#"{ + "version": 7, + "layer_metadata":{ + 
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "deleted_at": "2023-07-31T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 7, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Default::default(), + last_aux_file_policy: Default::default(), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + + fn parse_naive_datetime(s: &str) -> NaiveDateTime { + chrono::NaiveDateTime::parse_from_str(s, 
"%Y-%m-%dT%H:%M:%S.%f").unwrap() + } } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 11c6956875..c4dd184610 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -1,36 +1,34 @@ //! Helper functions to upload files to remote storage with a RemoteStorage use anyhow::{bail, Context}; +use bytes::Bytes; use camino::Utf8Path; use fail::fail_point; use pageserver_api::shard::TenantShardId; use std::io::{ErrorKind, SeekFrom}; +use std::time::SystemTime; use tokio::fs::{self, File}; use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; +use utils::{backoff, pausable_failpoint}; +use super::index::IndexPart; use super::Generation; -use crate::{ - config::PageServerConf, - tenant::remote_timeline_client::{ - index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path, - upload_cancellable, - }, +use crate::tenant::remote_timeline_client::{ + remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, }; -use remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; -use super::index::LayerFileMetadata; - use tracing::info; /// Serializes and uploads the given index part data to the remote storage. 
-pub(super) async fn upload_index_part<'a>( +pub(crate) async fn upload_index_part<'a>( storage: &'a GenericRemoteStorage, tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, generation: Generation, - index_part: &'a IndexPart, + index_part: &IndexPart, cancel: &CancellationToken, ) -> anyhow::Result<()> { tracing::trace!("uploading new index part"); @@ -40,23 +38,22 @@ pub(super) async fn upload_index_part<'a>( }); pausable_failpoint!("before-upload-index-pausable"); - let index_part_bytes = index_part - .to_s3_bytes() - .context("serialize index part file into bytes")?; - let index_part_size = index_part_bytes.len(); - let index_part_bytes = bytes::Bytes::from(index_part_bytes); + // FIXME: this error comes too late + let serialized = index_part.to_s3_bytes()?; + let serialized = Bytes::from(serialized); + + let index_part_size = serialized.len(); let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation); - upload_cancellable( - cancel, - storage.upload_storage_object( - futures::stream::once(futures::future::ready(Ok(index_part_bytes))), + storage + .upload_storage_object( + futures::stream::once(futures::future::ready(Ok(serialized))), index_part_size, &remote_path, - ), - ) - .await - .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) + cancel, + ) + .await + .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } /// Attempts to upload given layer files. @@ -64,11 +61,10 @@ pub(super) async fn upload_index_part<'a>( /// /// On an error, bumps the retries count and reschedules the entire task. 
pub(super) async fn upload_timeline_layer<'a>( - conf: &'static PageServerConf, storage: &'a GenericRemoteStorage, - source_path: &'a Utf8Path, - known_metadata: &'a LayerFileMetadata, - generation: Generation, + local_path: &'a Utf8Path, + remote_path: &'a RemotePath, + metadata_size: u64, cancel: &CancellationToken, ) -> anyhow::Result<()> { fail_point!("before-upload-layer", |_| { @@ -77,8 +73,7 @@ pub(super) async fn upload_timeline_layer<'a>( pausable_failpoint!("before-upload-layer-pausable"); - let storage_path = remote_path(conf, source_path, generation)?; - let source_file_res = fs::File::open(&source_path).await; + let source_file_res = fs::File::open(&local_path).await; let source_file = match source_file_res { Ok(source_file) => source_file, Err(e) if e.kind() == ErrorKind::NotFound => { @@ -89,35 +84,49 @@ pub(super) async fn upload_timeline_layer<'a>( // it has been written to disk yet. // // This is tested against `test_compaction_delete_before_upload` - info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); + info!(path = %local_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more."); return Ok(()); } - Err(e) => { - Err(e).with_context(|| format!("open a source file for layer {source_path:?}"))? - } + Err(e) => Err(e).with_context(|| format!("open a source file for layer {local_path:?}"))?, }; let fs_size = source_file .metadata() .await - .with_context(|| format!("get the source file metadata for layer {source_path:?}"))? + .with_context(|| format!("get the source file metadata for layer {local_path:?}"))? 
.len(); - let metadata_size = known_metadata.file_size(); if metadata_size != fs_size { - bail!("File {source_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); + bail!("File {local_path:?} has its current FS size {fs_size} diferent from initially determined {metadata_size}"); } let fs_size = usize::try_from(fs_size) - .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?; + .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?; let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); - upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None)) + storage + .upload(reader, fs_size, remote_path, None, cancel) .await - .with_context(|| format!("upload layer from local path '{source_path}'"))?; + .with_context(|| format!("upload layer from local path '{local_path}'")) +} - Ok(()) +pub(super) async fn copy_timeline_layer( + storage: &GenericRemoteStorage, + source_path: &RemotePath, + target_path: &RemotePath, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + fail_point!("before-copy-layer", |_| { + bail!("failpoint before-copy-layer") + }); + + pausable_failpoint!("before-copy-layer-pausable"); + + storage + .copy_object(source_path, target_path, cancel) + .await + .with_context(|| format!("copy layer {source_path} to {target_path}")) } /// Uploads the given `initdb` data to the remote storage. 
@@ -137,10 +146,66 @@ pub(crate) async fn upload_initdb_dir( let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE); let remote_path = remote_initdb_archive_path(tenant_id, timeline_id); - upload_cancellable( - cancel, - storage.upload_storage_object(file, size as usize, &remote_path), - ) - .await - .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) + storage + .upload_storage_object(file, size as usize, &remote_path, cancel) + .await + .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'")) +} + +pub(crate) async fn preserve_initdb_archive( + storage: &GenericRemoteStorage, + tenant_id: &TenantId, + timeline_id: &TimelineId, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let source_path = remote_initdb_archive_path(tenant_id, timeline_id); + let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id); + storage + .copy_object(&source_path, &dest_path, cancel) + .await + .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'")) +} + +pub(crate) async fn time_travel_recover_tenant( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timestamp: SystemTime, + done_if_after: SystemTime, + cancel: &CancellationToken, +) -> Result<(), TimeTravelError> { + let warn_after = 3; + let max_attempts = 10; + let mut prefixes = Vec::with_capacity(2); + if tenant_shard_id.is_shard_zero() { + // Also recover the unsharded prefix for a shard of zero: + // - if the tenant is totally unsharded, the unsharded prefix contains all the data + // - if the tenant is sharded, we still want to recover the initdb data, but we only + // want to do it once, so let's do it on the 0 shard + let timelines_path_unsharded = + super::remote_timelines_path_unsharded(&tenant_shard_id.tenant_id); + prefixes.push(timelines_path_unsharded); + } + if !tenant_shard_id.is_unsharded() { + // If the tenant is sharded, we need to recover the 
sharded prefix + let timelines_path = super::remote_timelines_path(tenant_shard_id); + prefixes.push(timelines_path); + } + for prefix in &prefixes { + backoff::retry( + || async { + storage + .time_travel_recover(Some(prefix), timestamp, done_if_after, cancel) + .await + }, + |e| !matches!(e, TimeTravelError::Other(_)), + warn_after, + max_attempts, + "time travel recovery of tenant prefix", + cancel, + ) + .await + .ok_or_else(|| TimeTravelError::Cancelled) + .and_then(|x| x)?; + } + Ok(()) } diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 2331447266..af6840f525 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -3,22 +3,35 @@ pub mod heatmap; mod heatmap_uploader; mod scheduler; -use std::sync::Arc; +use std::{sync::Arc, time::SystemTime}; -use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::{ + context::RequestContext, + disk_usage_eviction_task::DiskUsageEvictionInfo, + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, +}; use self::{ downloader::{downloader_task, SecondaryDetail}, heatmap_uploader::heatmap_uploader_task, }; -use super::{config::SecondaryLocationConfig, mgr::TenantManager}; +use super::{ + config::{SecondaryLocationConfig, TenantConfOpt}, + mgr::TenantManager, + span::debug_assert_current_span_has_tenant_id, + storage_layer::LayerName, +}; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{ + models, + shard::{ShardIdentity, TenantShardId}, +}; use remote_storage::GenericRemoteStorage; use tokio_util::sync::CancellationToken; -use utils::{completion::Barrier, sync::gate::Gate}; +use tracing::instrument; +use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate}; enum DownloadCommand { Download(TenantShardId), @@ -75,12 +88,24 @@ pub(crate) struct SecondaryTenant { pub(crate) gate: Gate, + // Secondary mode does not need the full shard identity or the TenantConfOpt. 
However, + // storing these enables us to report our full LocationConf, enabling convenient reconciliation + // by the control plane (see [`Self::get_location_conf`]) + shard_identity: ShardIdentity, + tenant_conf: std::sync::Mutex, + + // Internal state used by the Downloader. detail: std::sync::Mutex, + + // Public state indicating overall progress of downloads relative to the last heatmap seen + pub(crate) progress: std::sync::Mutex, } impl SecondaryTenant { pub(crate) fn new( tenant_shard_id: TenantShardId, + shard_identity: ShardIdentity, + tenant_conf: TenantConfOpt, config: &SecondaryLocationConfig, ) -> Arc { Arc::new(Self { @@ -90,12 +115,21 @@ impl SecondaryTenant { // on shutdown we walk the tenants and fire their // individual cancellations? cancel: CancellationToken::new(), - gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")), + gate: Gate::default(), + + shard_identity, + tenant_conf: std::sync::Mutex::new(tenant_conf), detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), + + progress: std::sync::Mutex::default(), }) } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { + self.tenant_shard_id + } + pub(crate) async fn shutdown(&self) { self.cancel.cancel(); @@ -107,15 +141,100 @@ impl SecondaryTenant { self.detail.lock().unwrap().config = config.clone(); } - fn get_tenant_shard_id(&self) -> &TenantShardId { + pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) { + *(self.tenant_conf.lock().unwrap()) = config.clone(); + } + + /// For API access: generate a LocationConfig equivalent to the one that would be used to + /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively + /// rare external API calls, like a reconciliation at startup. 
+ pub(crate) fn get_location_conf(&self) -> models::LocationConfig { + let conf = self.detail.lock().unwrap().config.clone(); + + let conf = models::LocationConfigSecondary { warm: conf.warm }; + + let tenant_conf = self.tenant_conf.lock().unwrap().clone(); + models::LocationConfig { + mode: models::LocationConfigMode::Secondary, + generation: None, + secondary_conf: Some(conf), + shard_number: self.tenant_shard_id.shard_number.0, + shard_count: self.tenant_shard_id.shard_count.literal(), + shard_stripe_size: self.shard_identity.stripe_size.0, + tenant_conf: tenant_conf.into(), + } + } + + pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId { &self.tenant_shard_id } + + pub(crate) fn get_layers_for_eviction(self: &Arc) -> (DiskUsageEvictionInfo, usize) { + self.detail.lock().unwrap().get_layers_for_eviction(self) + } + + /// Cancellation safe, but on cancellation the eviction will go through + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))] + pub(crate) async fn evict_layer(self: &Arc, timeline_id: TimelineId, name: LayerName) { + debug_assert_current_span_has_tenant_id(); + + let guard = match self.gate.enter() { + Ok(g) => g, + Err(_) => { + tracing::debug!("Dropping layer evictions, secondary tenant shutting down",); + return; + } + }; + + let now = SystemTime::now(); + tracing::info!("Evicting secondary layer"); + + let this = self.clone(); + + // spawn it to be cancellation safe + tokio::task::spawn_blocking(move || { + let _guard = guard; + + // Update the timeline's state. This does not have to be synchronized with + // the download process, because: + // - If downloader is racing with us to remove a file (e.g. because it is + // removed from heatmap), then our mutual .remove() operations will both + // succeed. 
+ // - If downloader is racing with us to download the object (this would require + // multiple eviction iterations to race with multiple download iterations), then + // if we remove it from the state, the worst that happens is the downloader + // downloads it again before re-inserting, or we delete the file but it remains + // in the state map (in which case it will be downloaded if this secondary + // tenant transitions to attached and tries to access it) + // + // The important assumption here is that the secondary timeline state does not + // have to 100% match what is on disk, because it's a best-effort warming + // of the cache. + let mut detail = this.detail.lock().unwrap(); + if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { + let removed = timeline_detail.on_disk_layers.remove(&name); + + // We might race with removal of the same layer during downloads, if it was removed + // from the heatmap. If we see that the OnDiskState is gone, then no need to + // do a physical deletion or store in evicted_at. + if let Some(removed) = removed { + removed.remove_blocking(); + timeline_detail.evicted_at.insert(name, now); + } + } + }) + .await + .expect("secondary eviction should not have panicked"); + } } /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, -/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, -/// where we want to immediately upload/download for a particular tenant. In normal operation -/// uploads & downloads are autonomous and not driven by this interface. +/// and heatmap uploads. This is not a hot data path: it's used for: +/// - Live migrations, where we want to ensure a migration destination has the freshest possible +/// content before trying to cut over. +/// - Tests, where we want to immediately upload/download for a particular tenant. 
+/// +/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface. pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, download_req_tx: tokio::sync::mpsc::Sender>, @@ -173,9 +292,13 @@ pub fn spawn_tasks( let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + let downloader_task_ctx = RequestContext::new( + TaskKind::SecondaryDownloads, + crate::context::DownloadBehavior::Download, + ); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), - TaskKind::SecondaryDownloads, + downloader_task_ctx.task_kind(), None, None, "secondary tenant downloads", @@ -187,6 +310,7 @@ pub fn spawn_tasks( download_req_rx, bg_jobs_clone, cancel_clone, + downloader_task_ctx, ) .await; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 6fdee08a4e..24176ecf19 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -8,24 +8,33 @@ use std::{ use crate::{ config::PageServerConf, + context::RequestContext, + disk_usage_eviction_task::{ + finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, + }, metrics::SECONDARY_MODE, tenant::{ config::SecondaryLocationConfig, debug_assert_current_span_has_tenant_and_timeline_id, + ephemeral_file::is_ephemeral_file, remote_timeline_client::{ - index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, }, span::debug_assert_current_span_has_tenant_id, - storage_layer::LayerFileName, + storage_layer::{layer::local_layer_path, LayerName}, tasks::{warn_when_period_overrun, BackgroundLoopKind}, }, virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, - METADATA_FILE_NAME, TEMP_FILE_SUFFIX, + TEMP_FILE_SUFFIX, }; use super::{ heatmap::HeatMapLayer, - scheduler::{self, 
Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + scheduler::{ + self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, + TenantBackgroundJobs, + }, SecondaryTenant, }; @@ -34,16 +43,18 @@ use crate::tenant::{ remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, }; +use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use rand::Rng; -use remote_storage::{DownloadError, GenericRemoteStorage}; +use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; -use tracing::{info_span, instrument, Instrument}; +use tracing::{info_span, instrument, warn, Instrument}; use utils::{ - backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext, + id::TimelineId, serde_system_time, }; use super::{ @@ -51,14 +62,10 @@ use super::{ CommandRequest, DownloadCommand, }; -/// For each tenant, how long must have passed since the last download_tenant call before -/// calling it again. This is approximately the time by which local data is allowed -/// to fall behind remote data. -/// -/// TODO: this should just be a default, and the actual period should be controlled -/// via the heatmap itself -/// `` -const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); +/// For each tenant, default period for how long must have passed since the last download_tenant call before +/// calling it again. This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after first +/// download, if the uploader populated it. 
+const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000); pub(super) async fn downloader_task( tenant_manager: Arc, @@ -66,30 +73,34 @@ pub(super) async fn downloader_task( command_queue: tokio::sync::mpsc::Receiver>, background_jobs_can_start: Barrier, cancel: CancellationToken, + root_ctx: RequestContext, ) { let concurrency = tenant_manager.get_conf().secondary_download_concurrency; let generator = SecondaryDownloader { tenant_manager, remote_storage, + root_ctx, }; let mut scheduler = Scheduler::new(generator, concurrency); scheduler .run(command_queue, background_jobs_can_start, cancel) - .instrument(info_span!("secondary_downloads")) + .instrument(info_span!("secondary_download_scheduler")) .await } struct SecondaryDownloader { tenant_manager: Arc, remote_storage: GenericRemoteStorage, + root_ctx: RequestContext, } #[derive(Debug, Clone)] pub(super) struct OnDiskState { metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, } impl OnDiskState { @@ -97,23 +108,46 @@ impl OnDiskState { _conf: &'static PageServerConf, _tenant_shard_id: &TenantShardId, _imeline_id: &TimelineId, - _ame: LayerFileName, + _ame: LayerName, metadata: LayerFileMetadata, access_time: SystemTime, + local_path: Utf8PathBuf, ) -> Self { Self { metadata, access_time, + local_path, } } + + // This is infallible, because all errors are either acceptable (ENOENT), or totally + // unexpected (fatal). + pub(super) fn remove_blocking(&self) { + // We tolerate ENOENT, because between planning eviction and executing + // it, the secondary downloader could have seen an updated heatmap that + // resulted in a layer being deleted. + // Other local I/O errors are process-fatal: these should never happen. 
+ std::fs::remove_file(&self.local_path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Deleting secondary layer") + } } #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + pub(super) on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. - pub(super) evicted_at: HashMap, + pub(super) evicted_at: HashMap, +} + +// Aspects of a heatmap that we remember after downloading it +#[derive(Clone, Debug)] +struct DownloadSummary { + etag: Etag, + #[allow(unused)] + mtime: SystemTime, + upload_period: Duration, } /// This state is written by the secondary downloader, it is opaque @@ -122,7 +156,7 @@ pub(super) struct SecondaryDetailTimeline { pub(super) struct SecondaryDetail { pub(super) config: SecondaryLocationConfig, - last_download: Option, + last_download: Option, next_download: Option, pub(super) timelines: HashMap, } @@ -133,6 +167,20 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat> { datetime.format("%d/%m/%Y %T") } +/// Information returned from download function when it detects the heatmap has changed +struct HeatMapModified { + etag: Etag, + last_modified: SystemTime, + bytes: Vec, +} + +enum HeatMapDownload { + // The heatmap's etag has changed: return the new etag, mtime and the body bytes + Modified(HeatMapModified), + // The heatmap's etag is unchanged + Unmodified, +} + impl SecondaryDetail { pub(super) fn new(config: SecondaryLocationConfig) -> Self { Self { @@ -142,13 +190,57 @@ impl SecondaryDetail { timelines: HashMap::new(), } } + + /// Additionally returns the total number of layers, used for more stable relative access time + /// based eviction. 
+ pub(super) fn get_layers_for_eviction( + &self, + parent: &Arc, + ) -> (DiskUsageEvictionInfo, usize) { + let mut result = DiskUsageEvictionInfo::default(); + let mut total_layers = 0; + + for (timeline_id, timeline_detail) in &self.timelines { + result + .resident_layers + .extend(timeline_detail.on_disk_layers.iter().map(|(name, ods)| { + EvictionCandidate { + layer: EvictionLayer::Secondary(EvictionSecondaryLayer { + secondary_tenant: parent.clone(), + timeline_id: *timeline_id, + name: name.clone(), + metadata: ods.metadata.clone(), + }), + last_activity_ts: ods.access_time, + relative_last_activity: finite_f32::FiniteF32::ZERO, + } + })); + + // total might be missing currently downloading layers, but as a lower than actual + // value it is good enough approximation. + total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len(); + } + result.max_layer_size = result + .resident_layers + .iter() + .map(|l| l.layer.get_file_size()) + .max(); + + tracing::debug!( + "eviction: secondary tenant {} found {} timelines, {} layers", + parent.get_tenant_shard_id(), + self.timelines.len(), + result.resident_layers.len() + ); + + (result, total_layers) + } } struct PendingDownload { secondary_state: Arc, - last_download: Option, + last_download: Option, target_time: Option, - period: Option, } impl scheduler::PendingJob for PendingDownload { @@ -186,7 +278,6 @@ type Scheduler = TenantBackgroundJobs< DownloadCommand, >; -#[async_trait::async_trait] impl JobGenerator for SecondaryDownloader { @@ -199,10 +290,17 @@ impl JobGenerator SchedulingResult { @@ -233,23 +331,20 @@ impl JobGenerator next_download { Some(PendingDownload { secondary_state: secondary_tenant, last_download, target_time: Some(next_download), - period: Some(DOWNLOAD_FRESHEN_INTERVAL), }) } else { None @@ -270,14 +365,11 @@ impl JobGenerator { @@ -315,7 +407,7 @@ impl JobGenerator { - tracing::debug!("Shut down while downloading"); + tracing::info!("Shut down while 
downloading"); }, Err(UpdateError::Deserialize(e)) => { tracing::error!("Corrupt content while downloading tenant: {e}"); @@ -330,26 +422,21 @@ impl JobGenerator for UpdateError { fn from(value: std::io::Error) -> Self { if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { UpdateError::NoSpace + } else if value + .get_ref() + .and_then(|x| x.downcast_ref::()) + .is_some() + { + UpdateError::from(DownloadError::from(value)) } else { - // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue + // An I/O error from e.g. tokio::io::copy_buf is most likely a remote storage issue UpdateError::Other(anyhow::anyhow!(value)) } } @@ -413,22 +506,42 @@ impl<'a> TenantDownloader<'a> { } } - async fn download(&self) -> Result<(), UpdateError> { + async fn download(&self, ctx: &RequestContext) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_id(); // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure // cover our access to local storage. 
let Ok(_guard) = self.secondary_state.gate.enter() else { // Shutting down - return Ok(()); + return Err(UpdateError::Cancelled); }; let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + + // We will use the etag from last successful download to make the download conditional on changes + let last_download = self + .secondary_state + .detail + .lock() + .unwrap() + .last_download + .clone(); + // Download the tenant's heatmap - let heatmap_bytes = tokio::select!( - bytes = self.download_heatmap() => {bytes?}, + let HeatMapModified { + last_modified: heatmap_mtime, + etag: heatmap_etag, + bytes: heatmap_bytes, + } = match tokio::select!( + bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?}, _ = self.secondary_state.cancel.cancelled() => return Ok(()) - ); + ) { + HeatMapDownload::Unmodified => { + tracing::info!("Heatmap unchanged since last successful download"); + return Ok(()); + } + HeatMapDownload::Modified(m) => m, + }; let heatmap = serde_json::from_slice::(&heatmap_bytes)?; @@ -439,25 +552,73 @@ impl<'a> TenantDownloader<'a> { let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); let heatmap_path_bg = heatmap_path.clone(); - tokio::task::spawn_blocking(move || { - tokio::runtime::Handle::current().block_on(async move { - VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await - }) - }) - .await - .expect("Blocking task is never aborted") - .maybe_fatal_err(&context_msg)?; + VirtualFile::crashsafe_overwrite(heatmap_path_bg, temp_path, heatmap_bytes) + .await + .maybe_fatal_err(&context_msg)?; - tracing::debug!("Wrote local heatmap to {}", heatmap_path); + tracing::debug!( + "Wrote local heatmap to {}, with {} timelines", + heatmap_path, + heatmap.timelines.len() + ); + + // Get or initialize the local disk state for the timelines we will update + let mut timeline_states = 
HashMap::new(); + for timeline in &heatmap.timelines { + let timeline_state = self + .secondary_state + .detail + .lock() + .unwrap() + .timelines + .get(&timeline.timeline_id) + .cloned(); + + let timeline_state = match timeline_state { + Some(t) => t, + None => { + // We have no existing state: need to scan local disk for layers first. + let timeline_state = + init_timeline_state(self.conf, tenant_shard_id, timeline).await; + + // Re-acquire detail lock now that we're done with async load from local FS + self.secondary_state + .detail + .lock() + .unwrap() + .timelines + .insert(timeline.timeline_id, timeline_state.clone()); + timeline_state + } + }; + + timeline_states.insert(timeline.timeline_id, timeline_state); + } + + // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general + // principle that deletions should be done before writes wherever possible, and so that we can use this + // phase to initialize our SecondaryProgress. + { + *self.secondary_state.progress.lock().unwrap() = + self.prepare_timelines(&heatmap, heatmap_mtime).await?; + } // Download the layers in the heatmap for timeline in heatmap.timelines { + let timeline_state = timeline_states + .remove(&timeline.timeline_id) + .expect("Just populated above"); + if self.secondary_state.cancel.is_cancelled() { + tracing::debug!( + "Cancelled before downloading timeline {}", + timeline.timeline_id + ); return Ok(()); } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline) + self.download_timeline(timeline, timeline_state, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -467,117 +628,256 @@ impl<'a> TenantDownloader<'a> { .await?; } + // Only update last_etag after a full successful download: this way will not skip + // the next download, even if the heatmap's actual etag is unchanged. 
+ self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { + etag: heatmap_etag, + mtime: heatmap_mtime, + upload_period: heatmap + .upload_period_ms + .map(|ms| Duration::from_millis(ms as u64)) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL), + }); + + // Robustness: we should have updated progress properly, but in case we didn't, make sure + // we don't leave the tenant in a state where we claim to have successfully downloaded + // everything, but our progress is incomplete. The invariant here should be that if + // we have set `last_download` to this heatmap's etag, then the next time we see that + // etag we can safely do no work (i.e. we must be complete). + let mut progress = self.secondary_state.progress.lock().unwrap(); + debug_assert!(progress.layers_downloaded == progress.layers_total); + debug_assert!(progress.bytes_downloaded == progress.bytes_total); + if progress.layers_downloaded != progress.layers_total + || progress.bytes_downloaded != progress.bytes_total + { + tracing::warn!("Correcting drift in progress stats ({progress:?})"); + progress.layers_downloaded = progress.layers_total; + progress.bytes_downloaded = progress.bytes_total; + } + Ok(()) } - async fn download_heatmap(&self) -> Result, UpdateError> { + /// Do any fast local cleanup that comes before the much slower process of downloading + /// layers from remote storage. In the process, initialize the SecondaryProgress object + /// that will later be updated incrementally as we download layers. 
+ async fn prepare_timelines( + &self, + heatmap: &HeatMapTenant, + heatmap_mtime: SystemTime, + ) -> Result { + let heatmap_stats = heatmap.get_stats(); + // We will construct a progress object, and then populate its initial "downloaded" numbers + // while iterating through local layer state in [`Self::prepare_timelines`] + let mut progress = SecondaryProgress { + layers_total: heatmap_stats.layers, + bytes_total: heatmap_stats.bytes, + heatmap_mtime: Some(serde_system_time::SystemTime(heatmap_mtime)), + layers_downloaded: 0, + bytes_downloaded: 0, + }; + // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock + let mut delete_layers = Vec::new(); + let mut delete_timelines = Vec::new(); + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + for (timeline_id, timeline_state) in &mut detail.timelines { + let Some(heatmap_timeline_index) = heatmap + .timelines + .iter() + .position(|t| t.timeline_id == *timeline_id) + else { + // This timeline is no longer referenced in the heatmap: delete it locally + delete_timelines.push(*timeline_id); + continue; + }; + + let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap(); + + let layers_in_heatmap = heatmap_timeline + .layers + .iter() + .map(|l| (&l.name, l.metadata.generation)) + .collect::>(); + let layers_on_disk = timeline_state + .on_disk_layers + .iter() + .map(|l| (l.0, l.1.metadata.generation)) + .collect::>(); + + let mut layer_count = layers_on_disk.len(); + let mut layer_byte_count: u64 = timeline_state + .on_disk_layers + .values() + .map(|l| l.metadata.file_size) + .sum(); + + // Remove on-disk layers that are no longer present in heatmap + for (layer_file_name, generation) in layers_on_disk.difference(&layers_in_heatmap) { + layer_count -= 1; + layer_byte_count -= timeline_state + .on_disk_layers + .get(layer_file_name) + .unwrap() + .metadata + .file_size; + + let local_path = local_layer_path( + self.conf, + 
self.secondary_state.get_tenant_shard_id(), + timeline_id, + layer_file_name, + generation, + ); + + delete_layers.push((*timeline_id, (*layer_file_name).clone(), local_path)); + } + + progress.bytes_downloaded += layer_byte_count; + progress.layers_downloaded += layer_count; + } + + for delete_timeline in &delete_timelines { + // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal + // from disk fails that will be a fatal error. + detail.timelines.remove(delete_timeline); + } + } + + // Execute accumulated deletions + for (timeline_id, layer_name, local_path) in delete_layers { + tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary layer")?; + + // Update in-memory housekeeping to reflect the absence of the deleted layer + let mut detail = self.secondary_state.detail.lock().unwrap(); + let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { + continue; + }; + timeline_state.on_disk_layers.remove(&layer_name); + } + + for timeline_id in delete_timelines { + let timeline_path = self + .conf + .timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id); + tracing::info!(timeline_id=%timeline_id, + "Timeline no longer in heatmap, removing from secondary location" + ); + tokio::fs::remove_dir_all(&timeline_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary timeline")?; + } + + Ok(progress) + } + + /// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object + /// still matches `prev_etag`. 
+ async fn download_heatmap( + &self, + prev_etag: Option<&Etag>, + ) -> Result { debug_assert_current_span_has_tenant_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - // TODO: make download conditional on ETag having changed since last download + // TODO: pull up etag check into the request, to do a conditional GET rather than + // issuing a GET and then maybe ignoring the response body // (https://github.com/neondatabase/neon/issues/6199) tracing::debug!("Downloading heatmap for secondary tenant",); let heatmap_path = remote_heatmap_path(tenant_shard_id); + let cancel = &self.secondary_state.cancel; - let heatmap_bytes = backoff::retry( + backoff::retry( || async { let download = self .remote_storage - .download(&heatmap_path) + .download(&heatmap_path, cancel) .await .map_err(UpdateError::from)?; - let mut heatmap_bytes = Vec::new(); - let mut body = tokio_util::io::StreamReader::new(download.download_stream); - let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?; - Ok(heatmap_bytes) + + SECONDARY_MODE.download_heatmap.inc(); + + if Some(&download.etag) == prev_etag { + Ok(HeatMapDownload::Unmodified) + } else { + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; + Ok(HeatMapDownload::Modified(HeatMapModified { + etag: download.etag, + last_modified: download.last_modified, + bytes: heatmap_bytes, + })) + } }, |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, "download heatmap", - backoff::Cancel::new(self.secondary_state.cancel.clone(), || { - UpdateError::Cancelled - }), + cancel, ) - .await?; - - SECONDARY_MODE.download_heatmap.inc(); - - Ok(heatmap_bytes) + .await + .ok_or_else(|| UpdateError::Cancelled) + .and_then(|x| x) } - async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { 
+ async fn download_timeline( + &self, + timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { debug_assert_current_span_has_tenant_and_timeline_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - let timeline_path = self - .conf - .timeline_path(tenant_shard_id, &timeline.timeline_id); // Accumulate updates to the state let mut touched = Vec::new(); - // Clone a view of what layers already exist on disk - let timeline_state = self - .secondary_state - .detail - .lock() - .unwrap() - .timelines - .get(&timeline.timeline_id) - .cloned(); - - let timeline_state = match timeline_state { - Some(t) => t, - None => { - // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, &timeline).await; - - // Re-acquire detail lock now that we're done with async load from local FS - self.secondary_state - .detail - .lock() - .unwrap() - .timelines - .insert(timeline.timeline_id, timeline_state.clone()); - timeline_state - } - }; - - let layers_in_heatmap = timeline - .layers - .iter() - .map(|l| &l.name) - .collect::>(); - let layers_on_disk = timeline_state - .on_disk_layers - .iter() - .map(|l| l.0) - .collect::>(); - - // Remove on-disk layers that are no longer present in heatmap - for layer in layers_on_disk.difference(&layers_in_heatmap) { - let local_path = timeline_path.join(layer.to_string()); - tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",); - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found) - .maybe_fatal_err("Removing secondary layer")?; - } + tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); // Download heatmap layers that are not present on local disk, or update their // access time if they are already present. 
for layer in timeline.layers { if self.secondary_state.cancel.is_cancelled() { - return Ok(()); + tracing::debug!("Cancelled -- dropping out of layer loop"); + return Err(UpdateError::Cancelled); } // Existing on-disk layers: just update their access time. if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { tracing::debug!("Layer {} is already on disk", layer.name); - if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) - || on_disk.access_time != layer.access_time - { + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. + match tokio::fs::metadata(&on_disk.local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + on_disk.local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + on_disk.local_path, + e + ); + debug_assert!(false); + } + } + } + + if on_disk.metadata != layer.metadata || on_disk.access_time != layer.access_time { // We already have this layer on disk. Update its access time. tracing::debug!( "Access time updated for layer {}: {} -> {}", @@ -609,56 +909,21 @@ impl<'a> TenantDownloader<'a> { strftime(&layer.access_time), strftime(evicted_at) ); + self.skip_layer(layer); continue; } } - // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally - let downloaded_bytes = match download_layer_file( - self.conf, - self.remote_storage, - *tenant_shard_id, - timeline.timeline_id, - &layer.name, - &LayerFileMetadata::from(&layer.metadata), - &self.secondary_state.cancel, - ) - .await + match self + .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) + .await? { - Ok(bytes) => bytes, - Err(e) => { - if let DownloadError::NotFound = e { - // A heatmap might be out of date and refer to a layer that doesn't exist any more. 
- // This is harmless: continue to download the next layer. It is expected during compaction - // GC. - tracing::debug!( - "Skipped downloading missing layer {}, raced with compaction/gc?", - layer.name - ); - continue; - } else { - return Err(e.into()); - } + Some(layer) => touched.push(layer), + None => { + // Not an error but we didn't download it: remote layer is missing. Don't add it to the list of + // things to consider touched. } - }; - - if downloaded_bytes != layer.metadata.file_size { - let local_path = timeline_path.join(layer.name.to_string()); - - tracing::warn!( - "Downloaded layer {} with unexpected size {} != {}. Removing download.", - layer.name, - downloaded_bytes, - layer.metadata.file_size - ); - - tokio::fs::remove_file(&local_path) - .await - .or_else(fs_ext::ignore_not_found)?; } - - SECONDARY_MODE.download_layer.inc(); - touched.push(layer) } // Write updates to state to record layers we just downloaded or touched. @@ -675,13 +940,21 @@ impl<'a> TenantDownloader<'a> { v.get_mut().access_time = t.access_time; } Entry::Vacant(e) => { + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + &t.name, + &t.metadata.generation, + ); e.insert(OnDiskState::new( self.conf, tenant_shard_id, &timeline.timeline_id, t.name, - LayerFileMetadata::from(&t.metadata), + t.metadata.clone(), t.access_time, + local_path, )); } } @@ -690,6 +963,103 @@ impl<'a> TenantDownloader<'a> { Ok(()) } + + /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics + fn skip_layer(&self, layer: HeatMapLayer) { + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.layers_total = progress.layers_total.saturating_sub(1); + progress.bytes_total = progress + .bytes_total + .saturating_sub(layer.metadata.file_size); + } + + async fn download_layer( + &self, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer: HeatMapLayer, + ctx: &RequestContext, + ) 
-> Result, UpdateError> { + // Failpoint for simulating slow remote storage + failpoint_support::sleep_millis_async!( + "secondary-layer-download-sleep", + &self.secondary_state.cancel + ); + + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + tracing::info!( + "Starting download of layer {}, size {}", + layer.name, + layer.metadata.file_size + ); + let downloaded_bytes = download_layer_file( + self.conf, + self.remote_storage, + *tenant_shard_id, + *timeline_id, + &layer.name, + &layer.metadata, + &local_path, + &self.secondary_state.cancel, + ctx, + ) + .await; + + let downloaded_bytes = match downloaded_bytes { + Ok(bytes) => bytes, + Err(DownloadError::NotFound) => { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. + tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + self.skip_layer(layer); + + return Ok(None); + } + Err(e) => return Err(e.into()), + }; + + if downloaded_bytes != layer.metadata.file_size { + let local_path = local_layer_path( + self.conf, + tenant_shard_id, + timeline_id, + &layer.name, + &layer.metadata.generation, + ); + + tracing::warn!( + "Downloaded layer {} with unexpected size {} != {}. 
Removing download.", + layer.name, + downloaded_bytes, + layer.metadata.file_size + ); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found)?; + } else { + tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes); + let mut progress = self.secondary_state.progress.lock().unwrap(); + progress.bytes_downloaded += downloaded_bytes; + progress.layers_downloaded += 1; + } + + SECONDARY_MODE.download_layer.inc(); + + Ok(Some(layer)) + } } /// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline @@ -721,7 +1091,7 @@ async fn init_timeline_state( // As we iterate through layers found on disk, we will look up their metadata from this map. // Layers not present in metadata will be discarded. - let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> = heatmap.layers.iter().map(|l| (&l.name, l)).collect(); while let Some(dentry) = dir @@ -729,19 +1099,32 @@ async fn init_timeline_state( .await .fatal_err(&format!("Listing {timeline_path}")) { - let dentry_file_name = dentry.file_name(); - let file_name = dentry_file_name.to_string_lossy(); - let local_meta = dentry.metadata().await.fatal_err(&format!( - "Read metadata on {}", - dentry.path().to_string_lossy() - )); + let Ok(file_path) = Utf8PathBuf::from_path_buf(dentry.path()) else { + tracing::warn!("Malformed filename at {}", dentry.path().to_string_lossy()); + continue; + }; + let local_meta = dentry + .metadata() + .await + .fatal_err(&format!("Read metadata on {}", file_path)); - // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. 
- if file_name == METADATA_FILE_NAME { + let file_name = file_path.file_name().expect("created it from the dentry"); + if crate::is_temporary(&file_path) + || is_temp_download_file(&file_path) + || is_ephemeral_file(file_name) + { + // Temporary files are frequently left behind from restarting during downloads + tracing::info!("Cleaning up temporary file {file_path}"); + if let Err(e) = tokio::fs::remove_file(&file_path) + .await + .or_else(fs_ext::ignore_not_found) + { + tracing::error!("Failed to remove temporary file {file_path}: {e}"); + } continue; } - match LayerFileName::from_str(&file_name) { + match LayerName::from_str(file_name) { Ok(name) => { let remote_meta = heatmap_metadata.get(&name); match remote_meta { @@ -766,8 +1149,9 @@ async fn init_timeline_state( tenant_shard_id, &heatmap.timeline_id, name, - LayerFileMetadata::from(&remote_meta.metadata), + remote_meta.metadata.clone(), remote_meta.access_time, + file_path, ), ); } diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs index 99aaaeb8c8..166483ba5d 100644 --- a/pageserver/src/tenant/secondary/heatmap.rs +++ b/pageserver/src/tenant/secondary/heatmap.rs @@ -1,8 +1,6 @@ use std::time::SystemTime; -use crate::tenant::{ - remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName, -}; +use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr, TimestampSeconds}; @@ -17,6 +15,14 @@ pub(super) struct HeatMapTenant { pub(super) generation: Generation, pub(super) timelines: Vec, + + /// Uploaders provide their own upload period in the heatmap, as a hint to downloaders + /// of how frequently it is worthwhile to check for updates. + /// + /// This is optional for backward compat, and because we sometimes might upload + /// a heatmap explicitly via API for a tenant that has no periodic upload configured. 
+ #[serde(default)] + pub(super) upload_period_ms: Option, } #[serde_as] @@ -31,8 +37,8 @@ pub(crate) struct HeatMapTimeline { #[serde_as] #[derive(Serialize, Deserialize)] pub(crate) struct HeatMapLayer { - pub(super) name: LayerFileName, - pub(super) metadata: IndexLayerMetadata, + pub(super) name: LayerName, + pub(super) metadata: LayerFileMetadata, #[serde_as(as = "TimestampSeconds")] pub(super) access_time: SystemTime, @@ -42,8 +48,8 @@ pub(crate) struct HeatMapLayer { impl HeatMapLayer { pub(crate) fn new( - name: LayerFileName, - metadata: IndexLayerMetadata, + name: LayerName, + metadata: LayerFileMetadata, access_time: SystemTime, ) -> Self { Self { @@ -62,3 +68,42 @@ impl HeatMapTimeline { } } } + +pub(crate) struct HeatMapStats { + pub(crate) bytes: u64, + pub(crate) layers: usize, +} + +impl HeatMapTenant { + pub(crate) fn get_stats(&self) -> HeatMapStats { + let mut stats = HeatMapStats { + bytes: 0, + layers: 0, + }; + for timeline in &self.timelines { + for layer in &timeline.layers { + stats.layers += 1; + stats.bytes += layer.metadata.file_size; + } + } + + stats + } + + pub(crate) fn strip_atimes(self) -> Self { + Self { + timelines: self + .timelines + .into_iter() + .map(|mut tl| { + for layer in &mut tl.layers { + layer.access_time = SystemTime::UNIX_EPOCH; + } + tl + }) + .collect(), + generation: self.generation, + upload_period_ms: self.upload_period_ms, + } + } +} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index ef01c33e8e..9c7a9c4234 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -9,6 +9,7 @@ use crate::{ metrics::SECONDARY_MODE, tenant::{ config::AttachmentMode, + mgr::GetTenantError, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, span::debug_assert_current_span_has_tenant_id, @@ -18,21 +19,21 @@ use crate::{ }; use futures::Future; -use md5; use 
pageserver_api::shard::TenantShardId; -use rand::Rng; -use remote_storage::GenericRemoteStorage; +use remote_storage::{GenericRemoteStorage, TimeoutOrCancel}; use super::{ - scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, - CommandRequest, + heatmap::HeatMapTenant, + scheduler::{ + self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, + TenantBackgroundJobs, + }, + CommandRequest, UploadCommand, }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; -use super::{heatmap::HeatMapTenant, UploadCommand}; - pub(super) async fn heatmap_uploader_task( tenant_manager: Arc, remote_storage: GenericRemoteStorage, @@ -52,7 +53,7 @@ pub(super) async fn heatmap_uploader_task( scheduler .run(command_queue, background_jobs_can_start, cancel) - .instrument(info_span!("heatmap_uploader")) + .instrument(info_span!("heatmap_upload_scheduler")) .await } @@ -79,7 +80,7 @@ impl RunningJob for WriteInProgress { struct UploadPending { tenant: Arc, - last_digest: Option, + last_upload: Option, target_time: Option, period: Option, } @@ -93,7 +94,7 @@ impl scheduler::PendingJob for UploadPending { struct WriteComplete { tenant_shard_id: TenantShardId, completed_at: Instant, - digest: Option, + uploaded: Option, next_upload: Option, } @@ -114,10 +115,7 @@ struct UploaderTenantState { tenant: Weak, /// Digest of the serialized heatmap that we last successfully uploaded - /// - /// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, - /// which is also an md5sum. 
- last_digest: Option, + last_upload_state: Option, /// When the last upload attempt completed (may have been successful or failed) last_upload: Option, @@ -134,7 +132,6 @@ type Scheduler = TenantBackgroundJobs< UploadCommand, >; -#[async_trait::async_trait] impl JobGenerator for HeatmapUploader { @@ -183,15 +180,11 @@ impl JobGenerator let state = self .tenants .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| { - let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); - - UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), - last_digest: None, - } + .or_insert_with(|| UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(period_warmup(period)).unwrap_or(now)), + last_upload_state: None, }); // Decline to do the upload if insufficient time has passed @@ -199,10 +192,10 @@ impl JobGenerator return; } - let last_digest = state.last_digest; + let last_upload = state.last_upload_state.clone(); result.jobs.push(UploadPending { tenant, - last_digest, + last_upload, target_time: state.next_upload, period: Some(period), }); @@ -222,7 +215,7 @@ impl JobGenerator ) { let UploadPending { tenant, - last_digest, + last_upload, target_time, period, } = job; @@ -235,16 +228,16 @@ impl JobGenerator let _completion = completion; let started_at = Instant::now(); - let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await { - Ok(UploadHeatmapOutcome::Uploaded(digest)) => { + let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await { + Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => { let duration = Instant::now().duration_since(started_at); SECONDARY_MODE .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap.inc(); - Some(digest) + Some(uploaded) } - Ok(UploadHeatmapOutcome::NoChange | 
UploadHeatmapOutcome::Skipped) => last_digest, + Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload, Err(UploadHeatmapError::Upload(e)) => { tracing::warn!( "Failed to upload heatmap for tenant {}: {e:#}", @@ -255,11 +248,11 @@ impl JobGenerator .upload_heatmap_duration .observe(duration.as_secs_f64()); SECONDARY_MODE.upload_heatmap_errors.inc(); - last_digest + last_upload } Err(UploadHeatmapError::Cancelled) => { tracing::info!("Cancelled heatmap upload, shutting down"); - last_digest + last_upload } }; @@ -276,12 +269,12 @@ impl JobGenerator let next_upload = tenant .get_heatmap_period() - .and_then(|period| now.checked_add(period)); + .and_then(|period| now.checked_add(period_jitter(period, 5))); WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), completed_at: now, - digest, + uploaded, next_upload, } }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) @@ -295,12 +288,15 @@ impl JobGenerator "Starting heatmap write on command"); let tenant = self .tenant_manager - .get_attached_tenant_shard(*tenant_shard_id, true) + .get_attached_tenant_shard(*tenant_shard_id) .map_err(|e| anyhow::anyhow!(e))?; + if !tenant.is_active() { + return Err(GetTenantError::NotActive(*tenant_shard_id).into()); + } Ok(UploadPending { // Ignore our state for last digest: this forces an upload even if nothing has changed - last_digest: None, + last_upload: None, tenant, target_time: None, period: None, @@ -313,7 +309,7 @@ impl JobGenerator let WriteComplete { tenant_shard_id, completed_at, - digest, + uploaded, next_upload, } = completion; use std::collections::hash_map::Entry; @@ -323,7 +319,7 @@ impl JobGenerator } Entry::Occupied(mut entry) => { entry.get_mut().last_upload = Some(completed_at); - entry.get_mut().last_digest = digest; + entry.get_mut().last_upload_state = uploaded; entry.get_mut().next_upload = next_upload } } @@ -332,7 +328,7 @@ impl 
JobGenerator enum UploadHeatmapOutcome { /// We successfully wrote to remote storage, with this digest. - Uploaded(md5::Digest), + Uploaded(LastUploadState), /// We did not upload because the heatmap digest was unchanged since the last upload NoChange, /// We skipped the upload for some reason, such as tenant/timeline not ready @@ -348,12 +344,25 @@ enum UploadHeatmapError { Upload(#[from] anyhow::Error), } +/// Digests describing the heatmap we most recently uploaded successfully. +/// +/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag, +/// which is also an md5sum. +#[derive(Clone)] +struct LastUploadState { + // Digest of json-encoded HeatMapTenant + uploaded_digest: md5::Digest, + + // Digest without atimes set. + layers_only_digest: md5::Digest, +} + /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest /// of the object we would have uploaded. async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, - last_digest: Option, + last_upload: Option, ) -> Result { debug_assert_current_span_has_tenant_id(); @@ -369,20 +378,16 @@ async fn upload_tenant_heatmap( let mut heatmap = HeatMapTenant { timelines: Vec::new(), generation, + upload_period_ms: tenant.get_heatmap_period().map(|p| p.as_millis()), }; let timelines = tenant.timelines.lock().unwrap().clone(); - let tenant_cancel = tenant.cancel.clone(); - // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind // in remote storage. 
- let _guard = match tenant.gate.enter() { - Ok(g) => g, - Err(_) => { - tracing::info!("Skipping heatmap upload for tenant which is shutting down"); - return Err(UploadHeatmapError::Cancelled); - } + let Ok(_guard) = tenant.gate.enter() else { + tracing::info!("Skipping heatmap upload for tenant which is shutting down"); + return Err(UploadHeatmapError::Cancelled); }; for (timeline_id, timeline) in timelines { @@ -402,36 +407,54 @@ async fn upload_tenant_heatmap( // Serialize the heatmap let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?; - let size = bytes.len(); // Drop out early if nothing changed since our last upload let digest = md5::compute(&bytes); - if Some(digest) == last_digest { + if Some(&digest) == last_upload.as_ref().map(|d| &d.uploaded_digest) { return Ok(UploadHeatmapOutcome::NoChange); } + // Calculate a digest that omits atimes, so that we can distinguish actual changes in + // layers from changes only in atimes. + let heatmap_size_bytes = heatmap.get_stats().bytes; + let layers_only_bytes = + serde_json::to_vec(&heatmap.strip_atimes()).map_err(|e| anyhow::anyhow!(e))?; + let layers_only_digest = md5::compute(&layers_only_bytes); + if heatmap_size_bytes < tenant.get_checkpoint_distance() { + // For small tenants, skip upload if only atimes changed. This avoids doing frequent + // uploads from long-idle tenants whose atimes are just incremented by periodic + // size calculations. + if Some(&layers_only_digest) == last_upload.as_ref().map(|d| &d.layers_only_digest) { + return Ok(UploadHeatmapOutcome::NoChange); + } + } + + let bytes = bytes::Bytes::from(bytes); + let size = bytes.len(); + let path = remote_heatmap_path(tenant.get_tenant_shard_id()); - // Write the heatmap. 
+ let cancel = &tenant.cancel; + tracing::debug!("Uploading {size} byte heatmap to {path}"); if let Err(e) = backoff::retry( || async { - let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from( - bytes.clone(), - )))); + let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); remote_storage - .upload_storage_object(bytes, size, &path) + .upload_storage_object(bytes, size, &path, cancel) .await }, - |_| false, + TimeoutOrCancel::caused_by_cancel, 3, u32::MAX, "Uploading heatmap", - backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")), + cancel, ) .await + .ok_or_else(|| anyhow::anyhow!("Shutting down")) + .and_then(|x| x) { - if tenant_cancel.is_cancelled() { + if cancel.is_cancelled() { return Err(UploadHeatmapError::Cancelled); } else { return Err(e.into()); @@ -440,5 +463,8 @@ async fn upload_tenant_heatmap( tracing::info!("Successfully uploaded {size} byte heatmap to {path}"); - Ok(UploadHeatmapOutcome::Uploaded(digest)) + Ok(UploadHeatmapOutcome::Uploaded(LastUploadState { + uploaded_digest: digest, + layers_only_digest, + })) } diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index cf01a100d9..28cf2125df 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -1,5 +1,5 @@ -use async_trait; use futures::Future; +use rand::Rng; use std::{ collections::HashMap, marker::PhantomData, @@ -20,6 +20,26 @@ use super::{CommandRequest, CommandResponse}; const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10); const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); +/// Jitter a Duration by an integer percentage. Returned values are uniform +/// in the range 100-pct..100+pct (i.e. 
a 5% jitter is 5% either way: a ~10% range) +pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration { + if d == Duration::ZERO { + d + } else { + rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100) + } +} + +/// When a periodic task first starts, it should wait for some time in the range 0..period, so +/// that starting many such tasks at the same time spreads them across the time range. +pub(super) fn period_warmup(period: Duration) -> Duration { + if period == Duration::ZERO { + period + } else { + rand::thread_rng().gen_range(Duration::ZERO..period) + } +} + /// Scheduling helper for background work across many tenants. /// /// Systems that need to run background work across many tenants may use this type @@ -65,7 +85,6 @@ where _phantom: PhantomData<(PJ, RJ, C, CMD)>, } -#[async_trait::async_trait] pub(crate) trait JobGenerator where C: Completion, @@ -160,6 +179,13 @@ where // Schedule some work, if concurrency limit permits it self.spawn_pending(); + // This message is printed every scheduling iteration as proof of liveness when looking at logs + tracing::info!( + "Status: {} tasks running, {} pending", + self.running.len(), + self.pending.len() + ); + // Between scheduling iterations, we will: // - Drain any complete tasks and spawn pending tasks // - Handle incoming administrative commands @@ -239,7 +265,11 @@ where self.tasks.spawn(fut); - self.running.insert(tenant_shard_id, in_progress); + let replaced = self.running.insert(tenant_shard_id, in_progress); + debug_assert!(replaced.is_none()); + if replaced.is_some() { + tracing::warn!(%tenant_shard_id, "Unexpectedly spawned a task when one was already running") + } } /// For all pending tenants that are elegible for execution, spawn their task. 
@@ -249,7 +279,9 @@ where while !self.pending.is_empty() && self.running.len() < self.concurrency { // unwrap: loop condition includes !is_empty() let pending = self.pending.pop_front().unwrap(); - self.do_spawn(pending); + if !self.running.contains_key(pending.get_tenant_shard_id()) { + self.do_spawn(pending); + } } } @@ -302,6 +334,11 @@ where let tenant_shard_id = job.get_tenant_shard_id(); let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + "Command already running, waiting for it" + ); barrier } else { let running = self.spawn_now(job); diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index e0b1652d98..b2338b620e 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,7 +3,6 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use anyhow::{bail, Context}; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -11,7 +10,7 @@ use tokio_util::sync::CancellationToken; use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; -use super::{LogicalSizeCalculationCause, Tenant}; +use super::{GcError, LogicalSizeCalculationCause, Tenant}; use crate::tenant::Timeline; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -43,6 +42,40 @@ pub struct SegmentMeta { pub kind: LsnKind, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum CalculateSyntheticSizeError { + /// Something went wrong internally to the calculation of logical size at a particular branch point + #[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")] + LogicalSize { + timeline_id: TimelineId, + lsn: Lsn, + error: CalculateLogicalSizeError, + }, + + /// Something went wrong internally when calculating GC parameters at start of size calculation + 
#[error(transparent)] + GcInfo(GcError), + + /// Totally unexpected errors, like panics joining a task + #[error(transparent)] + Fatal(anyhow::Error), + + /// Tenant shut down while calculating size + #[error("Cancelled")] + Cancelled, +} + +impl From for CalculateSyntheticSizeError { + fn from(value: GcError) -> Self { + match value { + GcError::TenantCancelled | GcError::TimelineCancelled => { + CalculateSyntheticSizeError::Cancelled + } + other => CalculateSyntheticSizeError::GcInfo(other), + } + } +} + impl SegmentMeta { fn size_needed(&self) -> bool { match self.kind { @@ -116,12 +149,9 @@ pub(super) async fn gather_inputs( cause: LogicalSizeCalculationCause, cancel: &CancellationToken, ctx: &RequestContext, -) -> anyhow::Result { +) -> Result { // refresh is needed to update gc related pitr_cutoff and horizon_cutoff - tenant - .refresh_gc_info(cancel, ctx) - .await - .context("Failed to refresh gc_info before gathering inputs")?; + tenant.refresh_gc_info(cancel, ctx).await?; // Collect information about all the timelines let mut timelines = tenant.list_timelines(); @@ -183,7 +213,15 @@ pub(super) async fn gather_inputs( // new gc run, which we have no control over. however differently from `Timeline::gc` // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not // actually removing files. - let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + // + // We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from + // a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather + // than a space bound (horizon cutoff). This means that if someone drops a database and waits for their + // PITR interval, they will see synthetic size decrease, even if we are still storing data inside + // horizon_cutoff. 
+ let pitr_cutoff = gc_info.cutoffs.pitr; + let horizon_cutoff = gc_info.cutoffs.horizon; + let mut next_gc_cutoff = pitr_cutoff; // If the caller provided a shorter retention period, use that instead of the GC cutoff. let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { @@ -210,6 +248,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + drop(gc_info); + // Add branch points we collected earlier, just in case there were any that were // not present in retain_lsns. We will remove any duplicates below later. if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { @@ -288,8 +328,8 @@ pub(super) async fn gather_inputs( last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff: gc_info.horizon_cutoff, - pitr_cutoff: gc_info.pitr_cutoff, + horizon_cutoff, + pitr_cutoff, next_gc_cutoff, retention_param_cutoff, }); @@ -317,6 +357,12 @@ pub(super) async fn gather_inputs( ) .await?; + if tenant.cancel.is_cancelled() { + // If we're shutting down, return an error rather than a sparse result that might include some + // timelines from before we started shutting down + return Err(CalculateSyntheticSizeError::Cancelled); + } + Ok(ModelInputs { segments, timeline_inputs, @@ -325,9 +371,8 @@ pub(super) async fn gather_inputs( /// Augment 'segments' with logical sizes /// -/// this will probably conflict with on-demand downloaded layers, or at least force them all -/// to be downloaded -/// +/// This will leave segments' sizes as None if the Timeline associated with the segment is deleted concurrently +/// (i.e. we cannot read its logical size at a particular LSN). 
async fn fill_logical_sizes( timelines: &[Arc], segments: &mut [SegmentMeta], @@ -335,7 +380,7 @@ async fn fill_logical_sizes( logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, cause: LogicalSizeCalculationCause, ctx: &RequestContext, -) -> anyhow::Result<()> { +) -> Result<(), CalculateSyntheticSizeError> { let timeline_hash: HashMap> = HashMap::from_iter( timelines .iter() @@ -377,7 +422,7 @@ async fn fill_logical_sizes( } // Perform the size lookups - let mut have_any_error = false; + let mut have_any_error = None; while let Some(res) = joinset.join_next().await { // each of these come with Result, JoinError> // because of spawn + spawn_blocking @@ -388,21 +433,36 @@ async fn fill_logical_sizes( Err(join_error) => { // cannot really do anything, as this panic is likely a bug error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}"); - have_any_error = true; + + have_any_error = Some(CalculateSyntheticSizeError::Fatal( + anyhow::anyhow!(join_error) + .context("task that calls spawn_ondemand_logical_size_calculation"), + )); } Ok(Err(recv_result_error)) => { // cannot really do anything, as this panic is likely a bug error!("failed to receive logical size query result: {recv_result_error:#}"); - have_any_error = true; + have_any_error = Some(CalculateSyntheticSizeError::Fatal( + anyhow::anyhow!(recv_result_error) + .context("Receiving logical size query result"), + )); } Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => { - if !matches!(error, CalculateLogicalSizeError::Cancelled) { + if matches!(error, CalculateLogicalSizeError::Cancelled) { + // Skip this: it's okay if one timeline among many is shutting down while we + // calculate inputs for the overall tenant. 
+ continue; + } else { warn!( timeline_id=%timeline.timeline_id, "failed to calculate logical size at {lsn}: {error:#}" ); + have_any_error = Some(CalculateSyntheticSizeError::LogicalSize { + timeline_id: timeline.timeline_id, + lsn, + error, + }); } - have_any_error = true; } Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => { debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); @@ -416,10 +476,10 @@ async fn fill_logical_sizes( // prune any keys not needed anymore; we record every used key and added key. logical_size_cache.retain(|key, _| sizes_needed.contains_key(key)); - if have_any_error { + if let Some(error) = have_any_error { // we cannot complete this round, because we are missing data. // we have however cached all we were able to request calculation on. - anyhow::bail!("failed to calculate some logical_sizes"); + return Err(error); } // Insert the looked up sizes to the Segments @@ -433,33 +493,28 @@ async fn fill_logical_sizes( if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { seg.segment.size = Some(*size); - } else { - bail!("could not find size at {} in timeline {}", lsn, timeline_id); } } Ok(()) } impl ModelInputs { - pub fn calculate_model(&self) -> anyhow::Result { + pub fn calculate_model(&self) -> tenant_size_model::StorageModel { // Convert SegmentMetas into plain Segments - let storage = StorageModel { + StorageModel { segments: self .segments .iter() .map(|seg| seg.segment.clone()) .collect(), - }; - - Ok(storage) + } } // calculate total project size - pub fn calculate(&self) -> anyhow::Result { - let storage = self.calculate_model()?; + pub fn calculate(&self) -> u64 { + let storage = self.calculate_model(); let sizes = storage.calculate(); - - Ok(sizes.total_size) + sizes.total_size } } @@ -646,7 +701,7 @@ fn verify_size_for_multiple_branches() { "#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); - assert_eq!(inputs.calculate().unwrap(), 37_851_408); + 
assert_eq!(inputs.calculate(), 37_851_408); } #[test] @@ -701,7 +756,7 @@ fn verify_size_for_one_branch() { let model: ModelInputs = serde_json::from_str(doc).unwrap(); - let res = model.calculate_model().unwrap().calculate(); + let res = model.calculate_model().calculate(); println!("calculated synthetic size: {}", res.total_size); println!("result: {:?}", serde_json::to_string(&res.segments)); diff --git a/pageserver/src/tenant/span.rs b/pageserver/src/tenant/span.rs deleted file mode 100644 index 04e92f4096..0000000000 --- a/pageserver/src/tenant/span.rs +++ /dev/null @@ -1,17 +0,0 @@ -#[cfg(debug_assertions)] -use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor}; - -#[cfg(not(debug_assertions))] -pub(crate) fn debug_assert_current_span_has_tenant_id() {} - -#[cfg(debug_assertions)] -pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"])); - -#[cfg(debug_assertions)] -#[track_caller] -pub(crate) fn debug_assert_current_span_has_tenant_id() { - if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) { - panic!("missing extractors: {missing:?}") - } -} diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6e9a4932d8..9607546ce0 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,24 +1,31 @@ //! 
Common traits and structs for layers pub mod delta_layer; -mod filename; pub mod image_layer; -mod inmemory_layer; +pub(crate) mod inmemory_layer; pub(crate) mod layer; mod layer_desc; +mod layer_name; use crate::context::{AccessStatsBehavior, RequestContext}; +use crate::repository::Value; use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use bytes::Bytes; use enum_map::EnumMap; use enumset::EnumSet; use once_cell::sync::Lazy; +use pageserver_api::key::Key; +use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::models::{ LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus, }; +use std::borrow::Cow; +use std::cmp::{Ordering, Reverse}; +use std::collections::hash_map::Entry; +use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; @@ -27,13 +34,18 @@ use utils::rate_limit::RateLimit; use utils::{id::TimelineId, lsn::Lsn}; pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; -pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; +pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; +use self::inmemory_layer::InMemoryLayerFileId; + +use super::timeline::GetVectoredError; +use super::PageReconstructError; + pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -61,12 +73,375 @@ where /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data' /// call, to collect more records. 
/// -#[derive(Debug)] +#[derive(Debug, Default)] pub struct ValueReconstructState { pub records: Vec<(Lsn, NeonWalRecord)>, pub img: Option<(Lsn, Bytes)>, } +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) enum ValueReconstructSituation { + Complete, + #[default] + Continue, +} + +/// Reconstruct data accumulated for a single key during a vectored get +#[derive(Debug, Default, Clone)] +pub(crate) struct VectoredValueReconstructState { + pub(crate) records: Vec<(Lsn, NeonWalRecord)>, + pub(crate) img: Option<(Lsn, Bytes)>, + + situation: ValueReconstructSituation, +} + +impl VectoredValueReconstructState { + fn get_cached_lsn(&self) -> Option { + self.img.as_ref().map(|img| img.0) + } +} + +impl From for ValueReconstructState { + fn from(mut state: VectoredValueReconstructState) -> Self { + // walredo expects the records to be descending in terms of Lsn + state.records.sort_by_key(|(lsn, _)| Reverse(*lsn)); + + ValueReconstructState { + records: state.records, + img: state.img, + } + } +} + +/// Bag of data accumulated during a vectored get.. +pub(crate) struct ValuesReconstructState { + /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline` + /// should not expect to get anything from this hashmap. + pub(crate) keys: HashMap>, + /// The keys which are already retrieved + keys_done: KeySpaceRandomAccum, + + /// The keys covered by the image layers + keys_with_image_coverage: Option>, + + // Statistics that are still accessible as a caller of `get_vectored_impl`. 
+ layers_visited: u32, + delta_layers_visited: u32, +} + +impl ValuesReconstructState { + pub(crate) fn new() -> Self { + Self { + keys: HashMap::new(), + keys_done: KeySpaceRandomAccum::new(), + keys_with_image_coverage: None, + layers_visited: 0, + delta_layers_visited: 0, + } + } + + /// Associate a key with the error which it encountered and mark it as done + pub(crate) fn on_key_error(&mut self, key: Key, err: PageReconstructError) { + let previous = self.keys.insert(key, Err(err)); + if let Some(Ok(state)) = previous { + if state.situation == ValueReconstructSituation::Continue { + self.keys_done.add_key(key); + } + } + } + + pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) { + self.layers_visited += 1; + if let ReadableLayer::PersistentLayer(layer) = layer { + if layer.layer_desc().is_delta() { + self.delta_layers_visited += 1; + } + } + } + + pub(crate) fn get_delta_layers_visited(&self) -> u32 { + self.delta_layers_visited + } + + pub(crate) fn get_layers_visited(&self) -> u32 { + self.layers_visited + } + + /// This function is called after reading a keyspace from a layer. + /// It checks if the read path has now moved past the cached Lsn for any keys. + /// + /// Implementation note: We intentionally iterate over the keys for which we've + /// already collected some reconstruct data. This avoids scaling complexity with + /// the size of the search space. + pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) { + for (key, value) in self.keys.iter_mut() { + if !keyspace.contains(key) { + continue; + } + + if let Ok(state) = value { + if state.situation != ValueReconstructSituation::Complete + && state.get_cached_lsn() >= Some(advanced_to) + { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + } + } + } + + /// On hitting image layer, we can mark all keys in this range as done, because + /// if the image layer does not contain a key, it is deleted/never added. 
+ pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range) { + let prev_val = self.keys_with_image_coverage.replace(key_range.clone()); + assert_eq!( + prev_val, None, + "should consume the keyspace before the next iteration" + ); + } + + /// Update the state collected for a given key. + /// Returns true if this was the last value needed for the key and false otherwise. + /// + /// If the key is done after the update, mark it as such. + pub(crate) fn update_key( + &mut self, + key: &Key, + lsn: Lsn, + value: Value, + ) -> ValueReconstructSituation { + let state = self + .keys + .entry(*key) + .or_insert(Ok(VectoredValueReconstructState::default())); + + if let Ok(state) = state { + let key_done = match state.situation { + ValueReconstructSituation::Complete => unreachable!(), + ValueReconstructSituation::Continue => match value { + Value::Image(img) => { + state.img = Some((lsn, img)); + true + } + Value::WalRecord(rec) => { + debug_assert!( + Some(lsn) > state.get_cached_lsn(), + "Attempt to collect a record below cached LSN for walredo: {} < {}", + lsn, + state + .get_cached_lsn() + .expect("Assertion can only fire if a cached lsn is present") + ); + + let will_init = rec.will_init(); + state.records.push((lsn, rec)); + will_init + } + }, + }; + + if key_done && state.situation == ValueReconstructSituation::Continue { + state.situation = ValueReconstructSituation::Complete; + self.keys_done.add_key(*key); + } + + state.situation + } else { + ValueReconstructSituation::Complete + } + } + + /// Returns the Lsn at which this key is cached if one exists. + /// The read path should go no further than this Lsn for the given key. + pub(crate) fn get_cached_lsn(&self, key: &Key) -> Option { + self.keys + .get(key) + .and_then(|k| k.as_ref().ok()) + .and_then(|state| state.get_cached_lsn()) + } + + /// Returns the key space describing the keys that have + /// been marked as completed since the last call to this function. 
+ /// Returns individual keys done, and the image layer coverage. + pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option>) { + ( + self.keys_done.consume_keyspace(), + self.keys_with_image_coverage.take(), + ) + } +} + +impl Default for ValuesReconstructState { + fn default() -> Self { + Self::new() + } +} + +/// A key that uniquely identifies a layer in a timeline +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub(crate) enum LayerId { + PersitentLayerId(PersistentLayerKey), + InMemoryLayerId(InMemoryLayerFileId), +} + +/// Layer wrapper for the read path. Note that it is valid +/// to use these layers even after external operations have +/// been performed on them (compaction, freeze, etc.). +#[derive(Debug)] +pub(crate) enum ReadableLayer { + PersistentLayer(Layer), + InMemoryLayer(Arc), +} + +/// A partial description of a read to be done. +#[derive(Debug, Clone)] +struct ReadDesc { + /// An id used to resolve the readable layer within the fringe + layer_id: LayerId, + /// Lsn range for the read, used for selecting the next read + lsn_range: Range, +} + +/// Data structure which maintains a fringe of layers for the +/// read path. The fringe is the set of layers which intersects +/// the current keyspace that the search is descending on. +/// Each layer tracks the keyspace that intersects it. +/// +/// The fringe must appear sorted by Lsn. Hence, it uses +/// a two layer indexing scheme. 
+#[derive(Debug)] +pub(crate) struct LayerFringe { + planned_reads_by_lsn: BinaryHeap, + layers: HashMap, +} + +#[derive(Debug)] +struct LayerKeyspace { + layer: ReadableLayer, + target_keyspace: KeySpaceRandomAccum, +} + +impl LayerFringe { + pub(crate) fn new() -> Self { + LayerFringe { + planned_reads_by_lsn: BinaryHeap::new(), + layers: HashMap::new(), + } + } + + pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { + let read_desc = match self.planned_reads_by_lsn.pop() { + Some(desc) => desc, + None => return None, + }; + + let removed = self.layers.remove_entry(&read_desc.layer_id); + + match removed { + Some(( + _, + LayerKeyspace { + layer, + mut target_keyspace, + }, + )) => Some(( + layer, + target_keyspace.consume_keyspace(), + read_desc.lsn_range, + )), + None => unreachable!("fringe internals are always consistent"), + } + } + + pub(crate) fn update( + &mut self, + layer: ReadableLayer, + keyspace: KeySpace, + lsn_range: Range, + ) { + let layer_id = layer.id(); + let entry = self.layers.entry(layer_id.clone()); + match entry { + Entry::Occupied(mut entry) => { + entry.get_mut().target_keyspace.add_keyspace(keyspace); + } + Entry::Vacant(entry) => { + self.planned_reads_by_lsn.push(ReadDesc { + lsn_range, + layer_id: layer_id.clone(), + }); + let mut accum = KeySpaceRandomAccum::new(); + accum.add_keyspace(keyspace); + entry.insert(LayerKeyspace { + layer, + target_keyspace: accum, + }); + } + } + } +} + +impl Default for LayerFringe { + fn default() -> Self { + Self::new() + } +} + +impl Ord for ReadDesc { + fn cmp(&self, other: &Self) -> Ordering { + let ord = self.lsn_range.end.cmp(&other.lsn_range.end); + if ord == std::cmp::Ordering::Equal { + self.lsn_range.start.cmp(&other.lsn_range.start).reverse() + } else { + ord + } + } +} + +impl PartialOrd for ReadDesc { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ReadDesc { + fn eq(&self, other: &Self) -> bool { + 
self.lsn_range == other.lsn_range + } +} + +impl Eq for ReadDesc {} + +impl ReadableLayer { + pub(crate) fn id(&self) -> LayerId { + match self { + Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()), + Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()), + } + } + + pub(crate) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + match self { + ReadableLayer::PersistentLayer(layer) => { + layer + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) + .await + } + ReadableLayer::InMemoryLayer(layer) => { + layer + .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) + .await + } + } + } +} + /// Return value from [`Layer::get_value_reconstruct_data`] #[derive(Clone, Copy, Debug)] pub enum ValueReconstructResult { @@ -141,7 +516,7 @@ impl LayerAccessStatFullDetails { } = self; pageserver_api::models::LayerAccessStatFullDetails { when_millis_since_epoch: system_time_to_millis_since_epoch(when), - task_kind: task_kind.into(), // into static str, powered by strum_macros + task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros access_kind: *access_kind, } } @@ -239,7 +614,7 @@ impl LayerAccessStats { .collect(), task_kind_access_flag: task_kind_flag .iter() - .map(|task_kind| task_kind.into()) // into static str, powered by strum_macros + .map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros .collect(), first: first_access.as_ref().map(|a| a.as_api_model()), accesses_history: last_accesses.map(|m| m.as_api_model()), @@ -257,6 +632,12 @@ impl LayerAccessStats { ret } + /// Get the latest access timestamp, falling back to latest residence event, further falling + /// back to `SystemTime::now` for a usable timestamp for eviction. 
+ pub(crate) fn latest_activity_or_now(&self) -> SystemTime { + self.latest_activity().unwrap_or_else(SystemTime::now) + } + /// Get the latest access timestamp, falling back to latest residence event. /// /// This function can only return `None` if there has not yet been a call to the @@ -271,7 +652,7 @@ impl LayerAccessStats { /// that that type can only be produced by inserting into the layer map. /// /// [`record_residence_event`]: Self::record_residence_event - pub(crate) fn latest_activity(&self) -> Option { + fn latest_activity(&self) -> Option { let locked = self.0.lock().unwrap(); let inner = &locked.for_eviction_policy; match inner.last_accesses.recent() { @@ -305,8 +686,8 @@ pub mod tests { use super::*; - impl From for PersistentLayerDesc { - fn from(value: DeltaFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: DeltaLayerName) -> Self { PersistentLayerDesc::new_delta( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -317,8 +698,8 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: ImageFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: ImageLayerName) -> Self { PersistentLayerDesc::new_img( TenantShardId::from([0; 18]), TimelineId::from_array([0; 16]), @@ -329,11 +710,11 @@ pub mod tests { } } - impl From for PersistentLayerDesc { - fn from(value: LayerFileName) -> Self { + impl From for PersistentLayerDesc { + fn from(value: LayerName) -> Self { match value { - LayerFileName::Delta(d) => Self::from(d), - LayerFileName::Image(i) => Self::from(i), + LayerName::Delta(d) => Self::from(d), + LayerName::Image(i) => Self::from(i), } } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 4ded6d6a8d..5e01ecd71d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -20,8 +20,8 @@ //! 
000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051 //! ``` //! -//! Every delta file consists of three parts: "summary", "index", and -//! "values". The summary is a fixed size header at the beginning of the file, +//! Every delta file consists of three parts: "summary", "values", and +//! "index". The summary is a fixed size header at the beginning of the file, //! and it contains basic information about the layer, and offsets to the other //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the //! "values" part. The actual page images and WAL records are stored in the @@ -29,18 +29,26 @@ //! use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; +use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; -use crate::tenant::Timeline; -use crate::virtual_file::VirtualFile; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, +}; +use crate::tenant::{PageReconstructError, Timeline}; +use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; -use anyhow::{bail, ensure, Context, Result}; +use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; +use futures::StreamExt; +use itertools::Itertools; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use 
pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; @@ -49,6 +57,7 @@ use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::fs::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; use tracing::*; @@ -59,7 +68,10 @@ use utils::{ lsn::Lsn, }; -use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer}; +use super::{ + AsLayerDesc, LayerAccessStats, LayerName, PersistentLayerDesc, ResidentLayer, + ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -208,8 +220,10 @@ pub struct DeltaLayerInner { index_start_blk: u32, index_root_blk: u32, - /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + max_vectored_read_bytes: Option, } impl std::fmt::Debug for DeltaLayerInner { @@ -291,18 +305,18 @@ impl DeltaLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, ctx) + let loaded = DeltaLayerInner::load(&path, None, None, ctx) .await .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(Arc::new(loaded)) @@ -379,6 +393,7 @@ impl DeltaLayerWriterInner { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: 
&RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. We don't know // the end key yet, so we cannot form the final filename yet. We will @@ -389,7 +404,7 @@ impl DeltaLayerWriterInner { let path = DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range); - let mut file = VirtualFile::create(&path).await?; + let mut file = VirtualFile::create(&path, ctx).await?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); @@ -415,28 +430,39 @@ impl DeltaLayerWriterInner { /// /// The values must be appended in key, lsn order. /// - async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init()) - .await + async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let (_, res) = self + .put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init(), ctx) + .await; + res } async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: &[u8], + val: Vec, will_init: bool, - ) -> anyhow::Result<()> { + ctx: &RequestContext, + ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - - let off = self.blob_writer.write_blob(val).await?; + let (val, res) = self.blob_writer.write_blob(val, ctx).await; + let off = match res { + Ok(off) => off, + Err(e) => return (val, Err(anyhow::anyhow!(e))), + }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - self.tree.append(&delta_key.0, blob_ref.0)?; - - Ok(()) + let res = self.tree.append(&delta_key.0, blob_ref.0); + (val, res.map_err(|e| anyhow::anyhow!(e))) } fn size(&self) -> u64 { @@ -446,18 +472,41 @@ impl DeltaLayerWriterInner { /// /// Finish writing the delta layer. 
/// - async fn finish(self, key_end: Key, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { + let temp_path = self.path.clone(); + let result = self.finish0(key_end, timeline, ctx).await; + if result.is_err() { + tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing"); + } + } + result + } + + async fn finish0( + self, + key_end: Key, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; - let mut file = self.blob_writer.into_inner().await?; + let mut file = self.blob_writer.into_inner(ctx).await?; // Write out the index let (index_root_blk, block_buf) = self.tree.finish()?; file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64)) .await?; for buf in block_buf.blocks { - file.write_all(buf.as_ref()).await?; + let (_buf, res) = file.write_all(buf, ctx).await; + res?; } assert!(self.lsn_range.start < self.lsn_range.end); // Fill in the summary on blk 0 @@ -472,17 +521,12 @@ impl DeltaLayerWriterInner { index_root_blk, }; - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; - if buf.spilled() { - // This is bad as we only have one free block for the summary - warn!( - "Used more than one page size for summary buffer: {}", - buf.len() - ); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf, ctx).await; + res?; let metadata = file .metadata() @@ -559,6 +603,7 @@ impl DeltaLayerWriter { tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + ctx: 
&RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( @@ -568,6 +613,7 @@ impl DeltaLayerWriter { tenant_shard_id, key_start, lsn_range, + ctx, ) .await?, ), @@ -579,21 +625,32 @@ impl DeltaLayerWriter { /// /// The values must be appended in key, lsn order. /// - pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_value(key, lsn, val).await + pub async fn put_value( + &mut self, + key: Key, + lsn: Lsn, + val: Value, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner + .as_mut() + .unwrap() + .put_value(key, lsn, val, ctx) + .await } pub async fn put_value_bytes( &mut self, key: Key, lsn: Lsn, - val: &[u8], + val: Vec, will_init: bool, - ) -> anyhow::Result<()> { + ctx: &RequestContext, + ) -> (Vec, anyhow::Result<()>) { self.inner .as_mut() .unwrap() - .put_value_bytes(key, lsn, val, will_init) + .put_value_bytes(key, lsn, val, will_init, ctx) .await } @@ -608,8 +665,13 @@ impl DeltaLayerWriter { mut self, key_end: Key, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(key_end, timeline).await + self.inner + .take() + .unwrap() + .finish(key_end, timeline, ctx) + .await } } @@ -647,34 +709,29 @@ impl DeltaLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, - &*std::fs::OpenOptions::new().read(true).write(true), + virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != 
DELTA_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } let new_summary = rewrite(actual_summary); - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here, but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; - if buf.spilled() { - // The code in DeltaLayerWriterInner just warn!()s for this. - // It should probably error out as well. - return Err(RewriteSummaryError::Other(anyhow::anyhow!( - "Used more than one page size for summary buffer: {}", - buf.len() - ))); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf, ctx).await; + res?; Ok(()) } } @@ -686,15 +743,18 @@ impl DeltaLayerInner { pub(super) async fn load( path: &Utf8Path, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path).await { + let file = match VirtualFile::open(path, ctx).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; - let file = FileBlockReader::new(file); + let file_id = page_cache::next_file_id(); - let summary_blk = match file.read_blk(0, ctx).await { + let block_reader = FileBlockReader::new(&file, file_id); + + let summary_blk = match block_reader.read_blk(0, ctx).await { Ok(blk) => blk, Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), }; @@ -707,6 +767,9 @@ impl DeltaLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; + if actual_summary != expected_summary { bail!( "in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", @@ -718,8 +781,10 @@ impl DeltaLayerInner { Ok(Ok(DeltaLayerInner { file, + file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, + max_vectored_read_bytes, })) } @@ -732,11 +797,11 @@ impl DeltaLayerInner { ) -> anyhow::Result { let mut need_image = true; // Scan the page versions backwards, starting from `lsn`. - let file = &self.file; + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + &block_reader, ); let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); @@ -770,19 +835,19 @@ impl DeltaLayerInner { .build(); // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = file.block_cursor(); + let cursor = block_reader.block_cursor(); let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { cursor .read_blob_into_buf(pos, &mut buf, ctx) .await .with_context(|| { - format!("Failed to read blob from virtual file {}", file.file.path) + format!("Failed to read blob from virtual file {}", self.file.path) })?; let val = Value::des(&buf).with_context(|| { format!( "Failed to deserialize file blob from virtual file {}", - file.file.path + self.file.path ) })?; match val { @@ -812,16 +877,276 @@ impl DeltaLayerInner { } } + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + // + // If the key is cached, go no further than the cached Lsn. + // + // Currently, the index is visited for each range, but this + // can be further optimised to visit the index only once. 
+ pub(super) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let data_end_offset = self.index_start_offset(); + + let reads = Self::plan_reads( + &keyspace, + lsn_range.clone(), + data_end_offset, + index_reader, + planner, + reconstruct_state, + ctx, + ) + .await + .map_err(GetVectoredError::Other)?; + + self.do_reads_and_update_state(reads, reconstruct_state, ctx) + .await; + + reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start); + + Ok(()) + } + + /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. 
+ #[cfg(test)] + pub(super) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + let mut result = Vec::new(); + let mut stream = + Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx)); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let cursor = block_reader.block_cursor(); + let mut buf = Vec::new(); + while let Some(item) = stream.next().await { + let (key, lsn, pos) = item?; + // TODO: dedup code with get_reconstruct_value + // TODO: ctx handling and sharding + cursor + .read_blob_into_buf(pos.pos(), &mut buf, ctx) + .await + .with_context(|| { + format!("Failed to read blob from virtual file {}", self.file.path) + })?; + let val = Value::des(&buf).with_context(|| { + format!( + "Failed to deserialize file blob from virtual file {}", + self.file.path + ) + })?; + result.push((key, lsn, val)); + } + Ok(result) + } + + async fn plan_reads( + keyspace: &KeySpace, + lsn_range: Range, + data_end_offset: u64, + index_reader: DiskBtreeReader, + mut planner: VectoredReadPlanner, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> anyhow::Result> + where + Reader: BlockReader, + { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::DeltaLayerBtreeNode) + .build(); + + for range in keyspace.ranges.iter() { + let mut range_end_handled = false; + + let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); + let index_stream = index_reader.get_stream_from(&start_key.0, &ctx); + let mut index_stream = std::pin::pin!(index_stream); + + while let Some(index_entry) = index_stream.next().await { + let (raw_key, value) = index_entry?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = 
DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); + + // Lsns are not monotonically increasing across keys, so we don't assert on them. + assert!(key >= range.start); + + let outside_lsn_range = !lsn_range.contains(&lsn); + let below_cached_lsn = reconstruct_state.get_cached_lsn(&key) >= Some(lsn); + + let flag = { + if outside_lsn_range || below_cached_lsn { + BlobFlag::Ignore + } else if blob_ref.will_init() { + BlobFlag::ReplaceAll + } else { + // Usual path: add blob to the read + BlobFlag::None + } + }; + + if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { + planner.handle_range_end(blob_ref.pos()); + range_end_handled = true; + break; + } else { + planner.handle(key, lsn, blob_ref.pos(), flag); + } + } + + if !range_end_handled { + tracing::debug!("Handling range end fallback at {}", data_end_offset); + planner.handle_range_end(data_end_offset); + } + } + + Ok(planner.finish()) + } + + fn get_min_read_buffer_size( + planned_reads: &[VectoredRead], + read_size_soft_max: usize, + ) -> usize { + let Some(largest_read) = planned_reads.iter().max_by_key(|read| read.size()) else { + return read_size_soft_max; + }; + + let largest_read_size = largest_read.size(); + if largest_read_size > read_size_soft_max { + // If the read is oversized, it should only contain one key. 
+ let offenders = largest_read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + largest_read_size, + read_size_soft_max, + offenders + ); + } + + largest_read_size + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) { + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut ignore_key_with_err = None; + + let max_vectored_read_bytes = self + .max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); + let mut buf = Some(BytesMut::with_capacity(buf_size)); + + // Note that reads are processed in reverse order (from highest key+lsn). + // This is the order that `ReconstructState` requires such that it can + // track when a key is done. + for read in reads.into_iter().rev() { + let res = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer"), ctx) + .await; + + let blobs_buf = match res { + Ok(blobs_buf) => blobs_buf, + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::from(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + + // We have "lost" the buffer since the lower level IO api + // doesn't return the buffer on error. Allocate a new one. 
+ buf = Some(BytesMut::with_capacity(buf_size)); + + continue; + } + }; + + for meta in blobs_buf.blobs.iter().rev() { + if Some(meta.meta.key) == ignore_key_with_err { + continue; + } + + let value = Value::des(&blobs_buf.buf[meta.start..meta.end]); + let value = match value { + Ok(v) => v, + Err(e) => { + reconstruct_state.on_key_error( + meta.meta.key, + PageReconstructError::from(anyhow!(e).context(format!( + "Failed to deserialize blob from virtual file {}", + self.file.path, + ))), + ); + + ignore_key_with_err = Some(meta.meta.key); + continue; + } + }; + + // Invariant: once a key reaches [`ValueReconstructSituation::Complete`] + // state, no further updates shall be made to it. The call below will + // panic if the invariant is violated. + reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value); + } + + buf = Some(blobs_buf.buf); + } + } + pub(super) async fn load_keys<'a>( &'a self, ctx: &RequestContext, ) -> Result>> { - let file = &self.file; - + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); let mut all_keys: Vec> = Vec::new(); @@ -862,29 +1187,224 @@ impl DeltaLayerInner { if let Some(last) = all_keys.last_mut() { // Last key occupies all space till end of value storage, // which corresponds to beginning of the index - last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size; + last.size = self.index_start_offset() - last.size; } Ok(all_keys) } + /// Using the given writer, write out a version which has the earlier Lsns than `until`. + /// + /// Return the amount of key value records pushed to the writer. 
+ pub(super) async fn copy_prefix( + &self, + writer: &mut DeltaLayerWriter, + until: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + use crate::tenant::vectored_blob_io::{ + BlobMeta, VectoredReadBuilder, VectoredReadExtended, + }; + use futures::stream::TryStreamExt; + + #[derive(Debug)] + enum Item { + Actual(Key, Lsn, BlobRef), + Sentinel, + } + + impl From for Option<(Key, Lsn, BlobRef)> { + fn from(value: Item) -> Self { + match value { + Item::Actual(key, lsn, blob) => Some((key, lsn, blob)), + Item::Sentinel => None, + } + } + } + + impl Item { + fn offset(&self) -> Option { + match self { + Item::Actual(_, _, blob) => Some(*blob), + Item::Sentinel => None, + } + } + + fn is_last(&self) -> bool { + matches!(self, Item::Sentinel) + } + } + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + + let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx); + let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos)); + // put in a sentinel value for getting the end offset for last item, and not having to + // repeat the whole read part + let stream = stream.chain(futures::stream::once(futures::future::ready(Ok( + Item::Sentinel, + )))); + let mut stream = std::pin::pin!(stream); + + let mut prev: Option<(Key, Lsn, BlobRef)> = None; + + let mut read_builder: Option = None; + + let max_read_size = self + .max_vectored_read_bytes + .map(|x| x.0.get()) + .unwrap_or(8192); + + let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + + // FIXME: buffering of DeltaLayerWriter + let mut per_blob_copy = Vec::new(); + + let mut records = 0; + + while let Some(item) = stream.try_next().await? 
{ + tracing::debug!(?item, "popped"); + let offset = item + .offset() + .unwrap_or(BlobRef::new(self.index_start_offset(), false)); + + let actionable = if let Some((key, lsn, start_offset)) = prev.take() { + let end_offset = offset; + + Some((BlobMeta { key, lsn }, start_offset..end_offset)) + } else { + None + }; + + let is_last = item.is_last(); + + prev = Option::from(item); + + let actionable = actionable.filter(|x| x.0.lsn < until); + + let builder = if let Some((meta, offsets)) = actionable { + // extend or create a new builder + if read_builder + .as_mut() + .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta)) + .unwrap_or(VectoredReadExtended::No) + == VectoredReadExtended::Yes + { + None + } else { + read_builder.replace(VectoredReadBuilder::new( + offsets.start.pos(), + offsets.end.pos(), + meta, + max_read_size, + )) + } + } else { + // nothing to do, except perhaps flush any existing for the last element + None + }; + + // flush the possible older builder and also the new one if the item was the last one + let builders = builder.into_iter(); + let builders = if is_last { + builders.chain(read_builder.take()) + } else { + builders.chain(None) + }; + + for builder in builders { + let read = builder.build(); + + let reader = VectoredBlobReader::new(&self.file); + + let mut buf = buffer.take().unwrap(); + + buf.clear(); + buf.reserve(read.size()); + let res = reader.read_blobs(&read, buf, ctx).await?; + + for blob in res.blobs { + let key = blob.meta.key; + let lsn = blob.meta.lsn; + let data = &res.buf[blob.start..blob.end]; + + #[cfg(debug_assertions)] + Value::des(data) + .with_context(|| { + format!( + "blob failed to deserialize for {}@{}, {}..{}: {:?}", + blob.meta.key, + blob.meta.lsn, + blob.start, + blob.end, + utils::Hex(data) + ) + }) + .unwrap(); + + // is it an image or will_init walrecord? 
+ // FIXME: this could be handled by threading the BlobRef to the + // VectoredReadBuilder + let will_init = crate::repository::ValueBytes::will_init(data) + .inspect_err(|_e| { + #[cfg(feature = "testing")] + tracing::error!(data=?utils::Hex(data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); + }) + .unwrap_or(false); + + per_blob_copy.clear(); + per_blob_copy.extend_from_slice(data); + + let (tmp, res) = writer + .put_value_bytes( + key, + lsn, + std::mem::take(&mut per_blob_copy), + will_init, + ctx, + ) + .await; + per_blob_copy = tmp; + + res?; + + records += 1; + } + + buffer = Some(res.buf); + } + } + + assert!( + read_builder.is_none(), + "with the sentinel above loop should had handled all" + ); + + Ok(records) + } + pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { println!( "index_start_blk: {}, root {}", self.index_start_blk, self.index_root_blk ); - let file = &self.file; + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); tree_reader.dump().await?; let keys = self.load_keys(ctx).await?; - async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { + async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; let val = Value::des(&buf)?; let desc = match val { @@ -906,17 +1426,73 @@ impl DeltaLayerInner { for entry in keys { let DeltaEntry { key, lsn, val, .. } = entry; - let desc = match dump_blob(val, ctx).await { + let desc = match dump_blob(&val, ctx).await { Ok(desc) => desc, Err(err) => { format!("ERROR: {err}") } }; println!(" key {key} at {lsn}: {desc}"); + + // Print more details about CHECKPOINT records. 
Would be nice to print details + // of many other record types too, but these are particularly interesting, as + // we have a lot of special processing for them in walingest.rs. + use pageserver_api::key::CHECKPOINT_KEY; + use postgres_ffi::CheckPoint; + if key == CHECKPOINT_KEY { + let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?; + let val = Value::des(&buf)?; + match val { + Value::Image(img) => { + let checkpoint = CheckPoint::decode(&img)?; + println!(" CHECKPOINT: {:?}", checkpoint); + } + Value::WalRecord(_rec) => { + println!(" unexpected walrecord value for checkpoint key"); + } + } + } } Ok(()) } + + fn stream_index_forwards<'a, R>( + &'a self, + reader: &'a DiskBtreeReader, + start: &'a [u8; DELTA_KEY_SIZE], + ctx: &'a RequestContext, + ) -> impl futures::stream::Stream< + Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>, + > + 'a + where + R: BlockReader, + { + use futures::stream::TryStreamExt; + let stream = reader.get_stream_from(start, ctx); + stream.map_ok(|(key, value)| { + let key = DeltaKey::from_slice(&key); + let (key, lsn) = (key.key(), key.lsn()); + let offset = BlobRef(value); + + (key, lsn, offset) + }) + } + + /// The file offset to the first block of index. + /// + /// The file structure is summary, values, and index. We often need this for the size of last blob.
+ fn index_start_offset(&self) -> u64 { + let offset = self.index_start_blk as u64 * PAGE_SZ as u64; + let bref = BlobRef(offset); + tracing::debug!( + index_start_blk = self.index_start_blk, + offset, + pos = bref.pos(), + "index_start_offset" + ); + offset + } } /// A set of data associated with a delta layer key and its value @@ -953,7 +1529,8 @@ impl> Adapter { blknum: u32, ctx: &RequestContext, ) -> Result { - self.0.as_ref().file.read_blk(blknum, ctx).await + let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id); + block_reader.read_blk(blknum, ctx).await } } @@ -962,3 +1539,592 @@ impl AsRef for DeltaLayerInner { self } } + +impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> { + fn key(&self) -> Key { + self.key + } + fn lsn(&self) -> Lsn { + self.lsn + } + fn size(&self) -> u64 { + self.size + } +} + +#[cfg(test)] +mod test { + use std::collections::BTreeMap; + + use itertools::MinMaxResult; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use rand::RngCore; + + use super::*; + use crate::{ + context::DownloadBehavior, + task_mgr::TaskKind, + tenant::{disk_btree::tests::TestDisk, harness::TenantHarness}, + DEFAULT_PG_VERSION, + }; + + /// Construct an index for a fictional delta layer and then + /// traverse in order to plan vectored reads for a query. Finally, + /// verify that the traversal fed the right index key and value + /// pairs into the planner.
+ #[tokio::test] + async fn test_delta_layer_index_traversal() { + let base_key = Key { + field1: 0, + field2: 1663, + field3: 12972, + field4: 16396, + field5: 0, + field6: 246080, + }; + + // Populate the index with some entries + let entries: BTreeMap> = BTreeMap::from([ + (base_key, vec![Lsn(1), Lsn(5), Lsn(25), Lsn(26), Lsn(28)]), + (base_key.add(1), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(2), vec![Lsn(2), Lsn(5), Lsn(10), Lsn(50)]), + (base_key.add(5), vec![Lsn(10), Lsn(15), Lsn(16), Lsn(20)]), + ]); + + let mut disk = TestDisk::default(); + let mut writer = DiskBtreeBuilder::<_, DELTA_KEY_SIZE>::new(&mut disk); + + let mut disk_offset = 0; + for (key, lsns) in &entries { + for lsn in lsns { + let index_key = DeltaKey::from_key_lsn(key, *lsn); + let blob_ref = BlobRef::new(disk_offset, false); + writer + .append(&index_key.0, blob_ref.0) + .expect("In memory disk append should never fail"); + + disk_offset += 1; + } + } + + // Prepare all the arguments for the call into `plan_reads` below + let (root_offset, _writer) = writer + .finish() + .expect("In memory disk finish should never fail"); + let reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(0, root_offset, disk); + let planner = VectoredReadPlanner::new(100); + let mut reconstruct_state = ValuesReconstructState::new(); + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let keyspace = KeySpace { + ranges: vec![ + base_key..base_key.add(3), + base_key.add(3)..base_key.add(100), + ], + }; + let lsn_range = Lsn(2)..Lsn(40); + + // Plan and validate + let vectored_reads = DeltaLayerInner::plan_reads( + &keyspace, + lsn_range.clone(), + disk_offset, + reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await + .expect("Read planning should not fail"); + + validate(keyspace, lsn_range, vectored_reads, entries); + } + + fn validate( + keyspace: KeySpace, + lsn_range: Range, + vectored_reads: Vec, + index_entries: BTreeMap>, + ) { + #[derive(Debug, 
PartialEq, Eq)] + struct BlobSpec { + key: Key, + lsn: Lsn, + at: u64, + } + + let mut planned_blobs = Vec::new(); + for read in vectored_reads { + for (at, meta) in read.blobs_at.as_slice() { + planned_blobs.push(BlobSpec { + key: meta.key, + lsn: meta.lsn, + at: *at, + }); + } + } + + let mut expected_blobs = Vec::new(); + let mut disk_offset = 0; + for (key, lsns) in index_entries { + for lsn in lsns { + let key_included = keyspace.ranges.iter().any(|range| range.contains(&key)); + let lsn_included = lsn_range.contains(&lsn); + + if key_included && lsn_included { + expected_blobs.push(BlobSpec { + key, + lsn, + at: disk_offset, + }); + } + + disk_offset += 1; + } + } + + assert_eq!(planned_blobs, expected_blobs); + } + + mod constants { + use utils::lsn::Lsn; + + /// Offset used by all lsns in this test + pub(super) const LSN_OFFSET: Lsn = Lsn(0x08); + /// Number of unique keys included in the test data + pub(super) const KEY_COUNT: u8 = 60; + /// Max number of different lsns for each key + pub(super) const MAX_ENTRIES_PER_KEY: u8 = 20; + /// Possible value sizes for each key along with a probability weight + pub(super) const VALUE_SIZES: [(usize, u8); 3] = [(100, 2), (1024, 2), (1024 * 1024, 1)]; + /// Probability that there will be a gap between the current key and the next one (33.3%) + pub(super) const KEY_GAP_CHANGES: [(bool, u8); 2] = [(true, 1), (false, 2)]; + /// The minimum size of a key range in all the generated reads + pub(super) const MIN_RANGE_SIZE: i128 = 10; + /// The number of ranges included in each vectored read + pub(super) const RANGES_COUNT: u8 = 2; + /// The number of vectored reads performed + pub(super) const READS_COUNT: u8 = 100; + /// Soft max size of a vectored read.
Will be violated if we have to read keys + /// with values larger than the limit + pub(super) const MAX_VECTORED_READ_BYTES: usize = 64 * 1024; + } + + struct Entry { + key: Key, + lsn: Lsn, + value: Vec, + } + + fn generate_entries(rng: &mut StdRng) -> Vec { + let mut current_key = Key::MIN; + + let mut entries = Vec::new(); + for _ in 0..constants::KEY_COUNT { + let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY); + let mut lsns_iter = + std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| { + Some(Lsn(lsn.0 + 0x08)) + }); + let mut lsns = Vec::new(); + while lsns.len() < count as usize { + let take = rng.gen_bool(0.5); + let lsn = lsns_iter.next().unwrap(); + if take { + lsns.push(lsn); + } + } + + for lsn in lsns { + let size = constants::VALUE_SIZES + .choose_weighted(rng, |item| item.1) + .unwrap() + .0; + let mut buf = vec![0; size]; + rng.fill_bytes(&mut buf); + + entries.push(Entry { + key: current_key, + lsn, + value: buf, + }) + } + + let gap = constants::KEY_GAP_CHANGES + .choose_weighted(rng, |item| item.1) + .unwrap() + .0; + if gap { + current_key = current_key.add(2); + } else { + current_key = current_key.add(1); + } + } + + entries + } + + struct EntriesMeta { + key_range: Range, + lsn_range: Range, + index: BTreeMap<(Key, Lsn), Vec>, + } + + fn get_entries_meta(entries: &[Entry]) -> EntriesMeta { + let key_range = match entries.iter().minmax_by_key(|e| e.key) { + MinMaxResult::MinMax(min, max) => min.key..max.key.next(), + _ => panic!("More than one entry is always expected"), + }; + + let lsn_range = match entries.iter().minmax_by_key(|e| e.lsn) { + MinMaxResult::MinMax(min, max) => min.lsn..Lsn(max.lsn.0 + 1), + _ => panic!("More than one entry is always expected"), + }; + + let mut index = BTreeMap::new(); + for entry in entries.iter() { + index.insert((entry.key, entry.lsn), entry.value.clone()); + } + + EntriesMeta { + key_range, + lsn_range, + index, + } + } + + fn pick_random_keyspace(rng: &mut StdRng, key_range: 
&Range) -> KeySpace { + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); + + let mut keyspace = KeySpace::default(); + + for _ in 0..constants::RANGES_COUNT { + let mut range: Option> = Option::default(); + while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) { + let range_start = rng.gen_range(start..end); + let range_end_offset = range_start + constants::MIN_RANGE_SIZE; + if range_end_offset >= end { + range = Some(Key::from_i128(range_start)..Key::from_i128(end)); + } else { + let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end); + range = Some(Key::from_i128(range_start)..Key::from_i128(range_end)); + } + } + keyspace.ranges.push(range.unwrap()); + } + + keyspace + } + + #[tokio::test] + async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?; + let (tenant, ctx) = harness.load().await; + + let timeline_id = TimelineId::generate(); + let timeline = tenant + .create_test_timeline(timeline_id, constants::LSN_OFFSET, DEFAULT_PG_VERSION, &ctx) + .await?; + + tracing::info!("Generating test data ..."); + + let rng = &mut StdRng::seed_from_u64(0); + let entries = generate_entries(rng); + let entries_meta = get_entries_meta(&entries); + + tracing::info!("Done generating {} entries", entries.len()); + + tracing::info!("Writing test data to delta layer ..."); + let mut writer = DeltaLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + entries_meta.key_range.start, + entries_meta.lsn_range.clone(), + &ctx, + ) + .await?; + + for entry in entries { + let (_, res) = writer + .put_value_bytes(entry.key, entry.lsn, entry.value, false, &ctx) + .await; + res?; + } + + let resident = writer + .finish(entries_meta.key_range.end, &timeline, &ctx) + .await?; + + let inner = resident.as_delta(&ctx).await?; + + let file_size = inner.file.metadata().await?.len(); + tracing::info!( + "Done 
writing test data to delta layer. Resulting file size is: {}", + file_size + ); + + for i in 0..constants::READS_COUNT { + tracing::info!("Doing vectored read {}/{}", i + 1, constants::READS_COUNT); + + let block_reader = FileBlockReader::new(&inner.file, inner.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + inner.index_start_blk, + inner.index_root_blk, + block_reader, + ); + + let planner = VectoredReadPlanner::new(constants::MAX_VECTORED_READ_BYTES); + let mut reconstruct_state = ValuesReconstructState::new(); + let keyspace = pick_random_keyspace(rng, &entries_meta.key_range); + let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64; + + let vectored_reads = DeltaLayerInner::plan_reads( + &keyspace, + entries_meta.lsn_range.clone(), + data_end_offset, + index_reader, + planner, + &mut reconstruct_state, + &ctx, + ) + .await?; + + let vectored_blob_reader = VectoredBlobReader::new(&inner.file); + let buf_size = DeltaLayerInner::get_min_read_buffer_size( + &vectored_reads, + constants::MAX_VECTORED_READ_BYTES, + ); + let mut buf = Some(BytesMut::with_capacity(buf_size)); + + for read in vectored_reads { + let blobs_buf = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer"), &ctx) + .await?; + for meta in blobs_buf.blobs.iter() { + let value = &blobs_buf.buf[meta.start..meta.end]; + assert_eq!(value, entries_meta.index[&(meta.meta.key, meta.meta.lsn)]); + } + + buf = Some(blobs_buf.buf); + } + } + + Ok(()) + } + + #[tokio::test] + async fn copy_delta_prefix_smoke() { + use crate::walrecord::NeonWalRecord; + use bytes::Bytes; + + let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap(); + let (tenant, ctx) = h.load().await; + let ctx = &ctx; + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) + .await + .unwrap(); + + let initdb_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .next() + .unwrap(); + 
+ { + let mut writer = timeline.writer().await; + + let data = [ + (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))), + ( + 0x30, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: false, + rec: Bytes::from_static(b"1"), + }), + ), + ( + 0x40, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"2"), + }), + ), + // build an oversized value so we cannot extend an existing read over + // this + ( + 0x50, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: { + let mut buf = + vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024]; + buf.iter_mut() + .enumerate() + .for_each(|(i, slot)| *slot = (i % 256) as u8); + Bytes::from(buf) + }, + }), + ), + // because the oversized read cannot be extended further, we are sure to exercise the + // builder created on the last round with this: + ( + 0x60, + 12, + Value::WalRecord(NeonWalRecord::Postgres { + will_init: true, + rec: Bytes::from_static(b"3"), + }), + ), + ( + 0x60, + 9, + Value::Image(Bytes::from_static(b"something for a different key")), + ), + ]; + + let mut last_lsn = None; + + for (lsn, key, value) in data { + let key = Key::from_i128(key); + writer.put(key, Lsn(lsn), &value, ctx).await.unwrap(); + last_lsn = Some(lsn); + } + + writer.finish_write(Lsn(last_lsn.unwrap())); + } + timeline.freeze_and_flush().await.unwrap(); + + let new_layer = timeline + .layers + .read() + .await + .likely_resident_layers() + .find(|x| x != &initdb_layer) + .unwrap(); + + // create a copy for the timeline, so we don't overwrite the file + let branch = tenant + .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx) + .await + .unwrap(); + + assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60)); + + // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just + // a single key + + for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] { + let truncate_at = Lsn(truncate_at); + + let mut writer =
DeltaLayerWriter::new( + tenant.conf, + branch.timeline_id, + tenant.tenant_shard_id, + Key::MIN, + Lsn(0x11)..truncate_at, + ctx, + ) + .await + .unwrap(); + + let new_layer = new_layer.download_and_keep_resident().await.unwrap(); + + new_layer + .copy_delta_prefix(&mut writer, truncate_at, ctx) + .await + .unwrap(); + + let copied_layer = writer.finish(Key::MAX, &branch, ctx).await.unwrap(); + + copied_layer.as_delta(ctx).await.unwrap(); + + assert_keys_and_values_eq( + new_layer.as_delta(ctx).await.unwrap(), + copied_layer.as_delta(ctx).await.unwrap(), + truncate_at, + ctx, + ) + .await; + } + } + + async fn assert_keys_and_values_eq( + source: &DeltaLayerInner, + truncated: &DeltaLayerInner, + truncated_at: Lsn, + ctx: &RequestContext, + ) { + use futures::future::ready; + use futures::stream::TryStreamExt; + + let start_key = [0u8; DELTA_KEY_SIZE]; + + let source_reader = FileBlockReader::new(&source.file, source.file_id); + let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + source.index_start_blk, + source.index_root_blk, + &source_reader, + ); + let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx); + let source_stream = source_stream.filter(|res| match res { + Ok((_, lsn, _)) => ready(lsn < &truncated_at), + _ => ready(true), + }); + let mut source_stream = std::pin::pin!(source_stream); + + let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id); + let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + truncated.index_start_blk, + truncated.index_root_blk, + &truncated_reader, + ); + let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx); + let mut truncated_stream = std::pin::pin!(truncated_stream); + + let mut scratch_left = Vec::new(); + let mut scratch_right = Vec::new(); + + loop { + let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next()); + let (src, truncated) = tokio::try_join!(src, truncated).unwrap(); + + if 
src.is_none() { + assert!(truncated.is_none()); + break; + } + + let (src, truncated) = (src.unwrap(), truncated.unwrap()); + + // because we've filtered the source with Lsn, we should always have the same keys from both. + assert_eq!(src.0, truncated.0); + assert_eq!(src.1, truncated.1); + + // if this is needed for something else, just drop this assert. + assert!( + src.2.pos() >= truncated.2.pos(), + "value position should not go backwards {} vs. {}", + src.2.pos(), + truncated.2.pos() + ); + + scratch_left.clear(); + let src_cursor = source_reader.block_cursor(); + let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx); + scratch_right.clear(); + let trunc_cursor = truncated_reader.block_cursor(); + let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx); + + tokio::try_join!(left, right).unwrap(); + + assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); + } + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index f03c7642eb..06e2f09384 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,31 +25,39 @@ //! actual page images are stored in the "values" part. 
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; -use crate::repository::{Key, KEY_SIZE}; +use crate::page_cache::{self, FileId, PAGE_SZ}; +use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; -use crate::tenant::Timeline; -use crate::virtual_file::VirtualFile; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, +}; +use crate::tenant::{PageReconstructError, Timeline}; +use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; -use anyhow::{bail, ensure, Context, Result}; -use bytes::Bytes; +use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; +use itertools::Itertools; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; -use pageserver_api::shard::TenantShardId; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; use std::os::unix::prelude::FileExt; +use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; +use tokio_stream::StreamExt; use tracing::*; use utils::{ @@ -58,8 +66,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::ImageFileName; -use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer}; +use super::layer_name::ImageLayerName; +use super::{ + AsLayerDesc, Layer, LayerName, PersistentLayerDesc, 
ResidentLayer, ValuesReconstructState, +}; /// /// Header stored in the beginning of the file @@ -148,10 +158,13 @@ pub struct ImageLayerInner { index_start_blk: u32, index_root_blk: u32, + key_range: Range, lsn: Lsn, - /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + max_vectored_read_bytes: Option, } impl std::fmt::Debug for ImageLayerInner { @@ -165,9 +178,12 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let file = &self.file; - let tree_reader = - DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); tree_reader.dump().await?; @@ -219,7 +235,7 @@ impl ImageLayer { conf: &PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - fname: &ImageFileName, + fname: &ImageLayerName, ) -> Utf8PathBuf { let rand_string: String = rand::thread_rng() .sample_iter(&Alphanumeric) @@ -250,18 +266,18 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx) + let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) .await .and_then(|res| res)?; // not production code - let actual_filename = path.file_name().unwrap().to_owned(); - let expected_filename = self.layer_desc().filename().file_name(); + let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); + let expected_layer_name = self.layer_desc().layer_name(); - if actual_filename != expected_filename { + if actual_layer_name != expected_layer_name { println!("warning: filename does not match what is expected from in-file summary"); 
- println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + println!("actual: {:?}", actual_layer_name.to_string()); + println!("expected: {:?}", expected_layer_name.to_string()); } Ok(loaded) @@ -325,34 +341,29 @@ impl ImageLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, - &*std::fs::OpenOptions::new().read(true).write(true), + virtual_file::OpenOptions::new().read(true).write(true), + ctx, ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != IMAGE_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } let new_summary = rewrite(actual_summary); - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&new_summary, &mut buf).context("serialize")?; - if buf.spilled() { - // The code in ImageLayerWriterInner just warn!()s for this. - // It should probably error out as well. 
- return Err(RewriteSummaryError::Other(anyhow::anyhow!( - "Used more than one page size for summary buffer: {}", - buf.len() - ))); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf, ctx).await; + res?; Ok(()) } } @@ -365,14 +376,16 @@ impl ImageLayerInner { path: &Utf8Path, lsn: Lsn, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { - let file = match VirtualFile::open(path).await { + let file = match VirtualFile::open(path, ctx).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; - let file = FileBlockReader::new(file); - let summary_blk = match file.read_blk(0, ctx).await { + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = match block_reader.read_blk(0, ctx).await { Ok(blk) => blk, Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), }; @@ -388,6 +401,8 @@ impl ImageLayerInner { // production code path expected_summary.index_start_blk = actual_summary.index_start_blk; expected_summary.index_root_blk = actual_summary.index_root_blk; + // mask out the timeline_id, but still require the layers to be from the same tenant + expected_summary.timeline_id = actual_summary.timeline_id; if actual_summary != expected_summary { bail!( @@ -403,6 +418,9 @@ impl ImageLayerInner { index_root_blk: actual_summary.index_root_blk, lsn, file, + file_id, + max_vectored_read_bytes, + key_range: actual_summary.key_range, })) } @@ -412,8 +430,9 @@ impl ImageLayerInner { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - let file = &self.file; - let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, 
&block_reader); let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -426,7 +445,7 @@ impl ImageLayerInner { ) .await? { - let blob = file + let blob = block_reader .block_cursor() .read_blob( offset, @@ -444,6 +463,232 @@ impl ImageLayerInner { Ok(ValueReconstructResult::Missing) } } + + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + pub(super) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let reads = self + .plan_reads(keyspace, None, ctx) + .await + .map_err(GetVectoredError::Other)?; + + self.do_reads_and_update_state(reads, reconstruct_state, ctx) + .await; + + reconstruct_state.on_image_layer_visited(&self.key_range); + + Ok(()) + } + + /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. + #[cfg(test)] + pub(super) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); + let mut result = Vec::new(); + let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx)); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let cursor = block_reader.block_cursor(); + while let Some(item) = stream.next().await { + // TODO: dedup code with get_reconstruct_value + let (raw_key, offset) = item?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + // TODO: ctx handling and sharding + let blob = cursor + .read_blob(offset, ctx) + .await + .with_context(|| format!("failed to read value from offset {}", offset))?; + let value = Bytes::from(blob); + result.push((key, self.lsn, Value::Image(value))); + } + Ok(result) + } + + /// Traverse the layer's index to 
build read operations on the overlap of the input keyspace + /// and the keys in this layer. + /// + /// If shard_identity is provided, it will be used to filter keys down to those stored on + /// this shard. + async fn plan_reads( + &self, + keyspace: KeySpace, + shard_identity: Option<&ShardIdentity>, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::ImageLayerBtreeNode) + .build(); + + for range in keyspace.ranges.iter() { + let mut range_end_handled = false; + let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + range.start.write_to_byte_slice(&mut search_key); + + let index_stream = tree_reader.get_stream_from(&search_key, &ctx); + let mut index_stream = std::pin::pin!(index_stream); + + while let Some(index_entry) = index_stream.next().await { + let (raw_key, offset) = index_entry?; + + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + assert!(key >= range.start); + + let flag = if let Some(shard_identity) = shard_identity { + if shard_identity.is_key_disposable(&key) { + BlobFlag::Ignore + } else { + BlobFlag::None + } + } else { + BlobFlag::None + }; + + if key >= range.end { + planner.handle_range_end(offset); + range_end_handled = true; + break; + } else { + planner.handle(key, self.lsn, offset, flag); + } + } + + if !range_end_handled { + let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; + planner.handle_range_end(payload_end); + } + } + + Ok(planner.finish()) + } + + /// Given a key range, select the parts of that range that should be retained by the ShardIdentity, + /// then execute vectored GET operations, passing the results 
of all read keys into the writer. + pub(super) async fn filter( + &self, + shard_identity: &ShardIdentity, + writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + // Fragment the range into the regions owned by this ShardIdentity + let plan = self + .plan_reads( + KeySpace { + // If asked for the total key space, plan_reads will give us all the keys in the layer + ranges: vec![Key::MIN..Key::MAX], + }, + Some(shard_identity), + ctx, + ) + .await?; + + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut key_count = 0; + for read in plan.into_iter() { + let buf_size = read.size(); + + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; + + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + + key_count += 1; + writer + .put_image(meta.meta.key, img_buf, ctx) + .await + .context(format!("Storing key {}", meta.meta.key))?; + } + } + + Ok(key_count) + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) { + let max_vectored_read_bytes = self + .max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + for read in reads.into_iter() { + let buf_size = read.size(); + + if buf_size > max_vectored_read_bytes { + // If the read is oversized, it should only contain one key. 
+ let offenders = read + .blobs_at + .as_slice() + .iter() + .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .join(", "); + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + buf_size, + max_vectored_read_bytes, + offenders + ); + } + + let buf = BytesMut::with_capacity(buf_size); + let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; + + match res { + Ok(blobs_buf) => { + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + reconstruct_state.update_key( + &meta.meta.key, + self.lsn, + Value::Image(img_buf), + ); + } + } + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::from(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + } + }; + } + } } /// A builder object for constructing a new image layer. @@ -479,6 +724,7 @@ impl ImageLayerWriterInner { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. @@ -486,17 +732,22 @@ impl ImageLayerWriterInner { conf, timeline_id, tenant_shard_id, - &ImageFileName { + &ImageLayerName { key_range: key_range.clone(), lsn, }, ); - info!("new image layer {path}"); - let mut file = VirtualFile::open_with_options( - &path, - std::fs::OpenOptions::new().write(true).create_new(true), - ) - .await?; + trace!("creating image layer {}", path); + let mut file = { + VirtualFile::open_with_options( + &path, + virtual_file::OpenOptions::new() + .write(true) + .create_new(true), + ctx, + ) + .await? 
+ }; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); @@ -524,9 +775,16 @@ impl ImageLayerWriterInner { /// /// The page versions must be appended in blknum order. /// - async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { + async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { ensure!(self.key_range.contains(&key)); - let off = self.blob_writer.write_blob(img).await?; + let (_img, res) = self.blob_writer.write_blob(img, ctx).await; + // TODO: re-use the buffer for `img` further upstack + let off = res?; let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -538,7 +796,11 @@ impl ImageLayerWriterInner { /// /// Finish writing the image layer. /// - async fn finish(self, timeline: &Arc) -> anyhow::Result { + async fn finish( + self, + timeline: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result { let index_start_blk = ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; @@ -549,7 +811,8 @@ impl ImageLayerWriterInner { .await?; let (index_root_blk, block_buf) = self.tree.finish()?; for buf in block_buf.blocks { - file.write_all(buf.as_ref()).await?; + let (_buf, res) = file.write_all(buf, ctx).await; + res?; } // Fill in the summary on blk 0 @@ -564,17 +827,12 @@ impl ImageLayerWriterInner { index_root_blk, }; - let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new(); + let mut buf = Vec::with_capacity(PAGE_SZ); + // TODO: could use smallvec here but it's a pain with Slice Summary::ser_into(&summary, &mut buf)?; - if buf.spilled() { - // This is bad as we only have one free block for the summary - warn!( - "Used more than one page size for summary buffer: {}", - buf.len() - ); - } file.seek(SeekFrom::Start(0)).await?; - file.write_all(&buf).await?; + let (_buf, res) = file.write_all(buf, ctx).await; + res?; let metadata = file .metadata() 
@@ -599,7 +857,7 @@ impl ImageLayerWriterInner { // FIXME: why not carry the virtualfile here, it supports renaming? let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?; - trace!("created image layer {}", layer.local_path()); + info!("created image layer {}", layer.local_path()); Ok(layer) } @@ -641,10 +899,11 @@ impl ImageLayerWriter { tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( - ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn) + ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx) .await?, ), }) @@ -655,8 +914,13 @@ impl ImageLayerWriter { /// /// The page versions must be appended in blknum order. /// - pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> { - self.inner.as_mut().unwrap().put_image(key, img).await + pub async fn put_image( + &mut self, + key: Key, + img: Bytes, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.inner.as_mut().unwrap().put_image(key, img, ctx).await } /// @@ -665,8 +929,9 @@ impl ImageLayerWriter { pub(crate) async fn finish( mut self, timeline: &Arc, + ctx: &RequestContext, ) -> anyhow::Result { - self.inner.take().unwrap().finish(timeline).await + self.inner.take().unwrap().finish(timeline, ctx).await } } @@ -677,3 +942,196 @@ impl Drop for ImageLayerWriter { } } } + +#[cfg(test)] +mod test { + use std::time::Duration; + + use bytes::Bytes; + use pageserver_api::{ + key::Key, + shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}, + }; + use utils::{ + generation::Generation, + id::{TenantId, TimelineId}, + lsn::Lsn, + }; + + use crate::{ + tenant::{config::TenantConf, harness::TenantHarness}, + DEFAULT_PG_VERSION, + }; + + use super::ImageLayerWriter; + + #[tokio::test] + async fn image_layer_rewrite() { + let tenant_conf = TenantConf { + gc_period: Duration::ZERO, + compaction_period: Duration::ZERO, + 
..TenantConf::default() + }; + let tenant_id = TenantId::generate(); + let mut gen = Generation::new(0xdead0001); + let mut get_next_gen = || { + let ret = gen; + gen = gen.next(); + ret + }; + // The LSN at which we will create an image layer to filter + let lsn = Lsn(0xdeadbeef0000); + let timeline_id = TimelineId::generate(); + + // + // Create an unsharded parent with a layer. + // + + let harness = TenantHarness::create_custom( + "test_image_layer_rewrite--parent", + tenant_conf.clone(), + tenant_id, + ShardIdentity::unsharded(), + get_next_gen(), + ) + .unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero + let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let range = input_start..input_end; + + // Build an image layer to filter + let resident = { + let mut writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + let foo_img = Bytes::from_static(&[1, 2, 3, 4]); + let mut key = range.start; + while key < range.end { + writer.put_image(key, foo_img.clone(), &ctx).await.unwrap(); + + key = key.next(); + } + writer.finish(&timeline, &ctx).await.unwrap() + }; + let original_size = resident.metadata().file_size; + + // + // Create child shards and do the rewrite, exercising filter(). + // TODO: abstraction in TenantHarness for splits. + // + + // Filter for various shards: this exercises cases like values at start of key range, end of key + // range, middle of key range. 
+ let shard_count = ShardCount::new(4); + for shard_number in 0..shard_count.count() { + // + // mimic the shard split + // + let shard_identity = ShardIdentity::new( + ShardNumber(shard_number), + shard_count, + ShardStripeSize(0x8000), + ) + .unwrap(); + let harness = TenantHarness::create_custom( + Box::leak(Box::new(format!( + "test_image_layer_rewrite--child{}", + shard_identity.shard_slug() + ))), + tenant_conf.clone(), + tenant_id, + shard_identity, + // NB: in reality, the shards would each fork off their own gen number sequence from the parent. + // But here, all we care about is that the gen number is unique. + get_next_gen(), + ) + .unwrap(); + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_test_timeline(timeline_id, lsn, DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + // + // use filter() and make assertions + // + + let mut filtered_writer = ImageLayerWriter::new( + harness.conf, + timeline_id, + harness.tenant_shard_id, + &range, + lsn, + &ctx, + ) + .await + .unwrap(); + + let wrote_keys = resident + .filter(&shard_identity, &mut filtered_writer, &ctx) + .await + .unwrap(); + let replacement = if wrote_keys > 0 { + Some(filtered_writer.finish(&timeline, &ctx).await.unwrap()) + } else { + None + }; + + // This exact size and those below will need updating as/when the layer encoding changes, but + // should be deterministic for a given version of the format, as we used no randomness generating the input. + assert_eq!(original_size, 1597440); + + match shard_number { + 0 => { + // We should have written out just one stripe for our shard identity + assert_eq!(wrote_keys, 0x8000); + let replacement = replacement.unwrap(); + + // We should have dropped some of the data + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + + // Assert that we dropped ~3/4 of the data. 
+ assert_eq!(replacement.metadata().file_size, 417792); + } + 1 => { + // Shard 1 has no keys in our input range + assert_eq!(wrote_keys, 0x0); + assert!(replacement.is_none()); + } + 2 => { + // Shard 2 has one stripes in the input range + assert_eq!(wrote_keys, 0x8000); + let replacement = replacement.unwrap(); + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + assert_eq!(replacement.metadata().file_size, 417792); + } + 3 => { + // Shard 3 has two stripes in the input range + assert_eq!(wrote_keys, 0x10000); + let replacement = replacement.unwrap(); + assert!(replacement.metadata().file_size < original_size); + assert!(replacement.metadata().file_size > 0); + assert_eq!(replacement.metadata().file_size, 811008); + } + _ => unreachable!(), + } + } + } +} diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 7c9103eea8..1ecc56ce99 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -9,28 +9,42 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::repository::{Key, Value}; use crate::tenant::block_io::BlockReader; use crate::tenant::ephemeral_file::EphemeralFile; -use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; -use crate::tenant::Timeline; -use crate::walrecord; -use anyhow::{ensure, Result}; +use crate::tenant::storage_layer::ValueReconstructResult; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{PageReconstructError, Timeline}; +use crate::{page_cache, walrecord}; +use anyhow::{anyhow, ensure, Result}; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::HashMap; +use std::collections::{BTreeMap, BinaryHeap, HashSet}; use std::sync::{Arc, OnceLock}; +use 
std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods -use std::fmt::Write as _; +use crate::metrics::TIMELINE_EPHEMERAL_BYTES; +use std::cmp::Ordering; +use std::fmt::Write; use std::ops::Range; +use std::sync::atomic::Ordering as AtomicOrdering; +use std::sync::atomic::{AtomicU64, AtomicUsize}; use tokio::sync::{RwLock, RwLockWriteGuard}; -use super::{DeltaLayerWriter, ResidentLayer}; +use super::{ + DeltaLayerWriter, ResidentLayer, ValueReconstructSituation, ValueReconstructState, + ValuesReconstructState, +}; + +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] +pub(crate) struct InMemoryLayerFileId(page_cache::FileId); pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + file_id: InMemoryLayerFileId, /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. @@ -38,7 +52,15 @@ pub struct InMemoryLayer { /// Frozen layers have an exclusive end LSN. /// Writes are only allowed when this is `None`. - end_lsn: OnceLock, + pub(crate) end_lsn: OnceLock, + + /// Used for traversal path. Cached representation of the in-memory layer before frozen. + local_path_str: Arc, + + /// Used for traversal path. Cached representation of the in-memory layer after frozen. + frozen_local_path_str: OnceLock>, + + opened_at: Instant, /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. @@ -56,15 +78,17 @@ impl std::fmt::Debug for InMemoryLayer { } pub struct InMemoryLayerInner { - /// All versions of all pages in the layer are kept here. Indexed + /// All versions of all pages in the layer are kept here. Indexed /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. 
- index: HashMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. /// PerSeg::page_versions map stores offsets into this file. file: EphemeralFile, + + resource_units: GlobalResourceUnits, } impl std::fmt::Debug for InMemoryLayerInner { @@ -73,7 +97,126 @@ impl std::fmt::Debug for InMemoryLayerInner { } } +/// State shared by all in-memory (ephemeral) layers. Updated infrequently during background ticks in Timeline, +/// to minimize contention. +/// +/// This global state is used to implement behaviors that require a global view of the system, e.g. +/// rolling layers proactively to limit the total amount of dirty data. +pub(crate) struct GlobalResources { + // Limit on how high dirty_bytes may grow before we start freezing layers to reduce it. + // Zero means unlimited. + pub(crate) max_dirty_bytes: AtomicU64, + // How many bytes are in all EphemeralFile objects + dirty_bytes: AtomicU64, + // How many layers are contributing to dirty_bytes + dirty_layers: AtomicUsize, +} + +// Per-timeline RAII struct for its contribution to [`GlobalResources`] +struct GlobalResourceUnits { + // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible + // for decrementing the global counter by this many bytes when dropped. + dirty_bytes: u64, +} + +impl GlobalResourceUnits { + // Hint for the layer append path to update us when the layer size differs from the last + // call to update_size by this much. If we don't reach this threshold, we'll still get + // updated when the Timeline "ticks" in the background. + const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024; + + fn new() -> Self { + GLOBAL_RESOURCES + .dirty_layers + .fetch_add(1, AtomicOrdering::Relaxed); + Self { dirty_bytes: 0 } + } + + /// Do not call this frequently: all timelines will write to these same global atomics, + /// so this is a relatively expensive operation. 
Wait at least a few seconds between calls. + /// + /// Returns the effective layer size limit that should be applied, if any, to keep + /// the total number of dirty bytes below the configured maximum. + fn publish_size(&mut self, size: u64) -> Option { + let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) { + Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed), + Ordering::Greater => { + let delta = size - self.dirty_bytes; + let old = GLOBAL_RESOURCES + .dirty_bytes + .fetch_add(delta, AtomicOrdering::Relaxed); + old + delta + } + Ordering::Less => { + let delta = self.dirty_bytes - size; + let old = GLOBAL_RESOURCES + .dirty_bytes + .fetch_sub(delta, AtomicOrdering::Relaxed); + old - delta + } + }; + + // This is a sloppy update: concurrent updates to the counter will race, and the exact + // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes. + // That's okay: as long as the metric contains some recent value, it doesn't have to always + // be literally the last update. + TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes); + + self.dirty_bytes = size; + + let max_dirty_bytes = GLOBAL_RESOURCES + .max_dirty_bytes + .load(AtomicOrdering::Relaxed); + if max_dirty_bytes > 0 && new_global_dirty_bytes > max_dirty_bytes { + // Set the layer file limit to the average layer size: this implies that all above-average + // sized layers will be elegible for freezing. They will be frozen in the order they + // next enter publish_size. 
+ Some( + new_global_dirty_bytes + / GLOBAL_RESOURCES.dirty_layers.load(AtomicOrdering::Relaxed) as u64, + ) + } else { + None + } + } + + // Call publish_size if the input size differs from last published size by more than + // the drift limit + fn maybe_publish_size(&mut self, size: u64) { + let publish = match size.cmp(&self.dirty_bytes) { + Ordering::Equal => false, + Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT, + Ordering::Less => self.dirty_bytes - size > Self::MAX_SIZE_DRIFT, + }; + + if publish { + self.publish_size(size); + } + } +} + +impl Drop for GlobalResourceUnits { + fn drop(&mut self) { + GLOBAL_RESOURCES + .dirty_layers + .fetch_sub(1, AtomicOrdering::Relaxed); + + // Subtract our contribution to the global total dirty bytes + self.publish_size(0); + } +} + +pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources { + max_dirty_bytes: AtomicU64::new(0), + dirty_bytes: AtomicU64::new(0), + dirty_layers: AtomicUsize::new(0), +}; + impl InMemoryLayer { + pub(crate) fn file_id(&self) -> InMemoryLayerFileId { + self.file_id + } + pub(crate) fn get_timeline_id(&self) -> TimelineId { self.timeline_id } @@ -88,6 +231,10 @@ impl InMemoryLayer { } } + pub(crate) fn try_len(&self) -> Option { + self.inner.try_read().map(|i| i.file.len()).ok() + } + pub(crate) fn assert_writable(&self) { assert!(self.end_lsn.get().is_none()); } @@ -100,6 +247,12 @@ impl InMemoryLayer { self.start_lsn..self.end_lsn_or_max() } + pub(crate) fn local_path_str(&self) -> &Arc { + self.frozen_local_path_str + .get() + .unwrap_or(&self.local_path_str) + } + /// debugging function to print out the contents of the layer /// /// this is likely completly unused @@ -202,12 +355,108 @@ impl InMemoryLayer { Ok(ValueReconstructResult::Complete) } } + + // Look up the keys in the provided keyspace and update + // the reconstruct state with whatever is found. + // + // If the key is cached, go no further than the cached Lsn. 
+ pub(crate) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + end_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); + + let inner = self.inner.read().await; + let reader = inner.file.block_cursor(); + + #[derive(Eq, PartialEq, Ord, PartialOrd)] + struct BlockRead { + key: Key, + lsn: Lsn, + block_offset: u64, + } + + let mut planned_block_reads = BinaryHeap::new(); + + for range in keyspace.ranges.iter() { + for (key, vec_map) in inner.index.range(range.start..range.end) { + let lsn_range = match reconstruct_state.get_cached_lsn(key) { + Some(cached_lsn) => (cached_lsn + 1)..end_lsn, + None => self.start_lsn..end_lsn, + }; + + let slice = vec_map.slice_range(lsn_range); + for (entry_lsn, pos) in slice.iter().rev() { + planned_block_reads.push(BlockRead { + key: *key, + lsn: *entry_lsn, + block_offset: *pos, + }); + } + } + } + + let keyspace_size = keyspace.total_raw_size(); + + let mut completed_keys = HashSet::new(); + while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() { + let block_read = planned_block_reads.pop().unwrap(); + if completed_keys.contains(&block_read.key) { + continue; + } + + let buf = reader.read_blob(block_read.block_offset, &ctx).await; + if let Err(e) = buf { + reconstruct_state + .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); + completed_keys.insert(block_read.key); + continue; + } + + let value = Value::des(&buf.unwrap()); + if let Err(e) = value { + reconstruct_state + .on_key_error(block_read.key, PageReconstructError::from(anyhow!(e))); + completed_keys.insert(block_read.key); + continue; + } + + let key_situation = + reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + 
completed_keys.insert(block_read.key); + } + } + + reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn); + + Ok(()) + } +} + +fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { + write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) +} + +fn inmem_layer_log_display( + mut f: impl Write, + timeline: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, +) -> std::fmt::Result { + write!(f, "timeline {} in-memory ", timeline)?; + inmem_layer_display(f, start_lsn, end_lsn) } impl std::fmt::Display for InMemoryLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let end_lsn = self.end_lsn_or_max(); - write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0) + inmem_layer_display(f, self.start_lsn, end_lsn) } } @@ -224,20 +473,31 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, + ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, ctx).await?; + let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { + file_id: key, + local_path_str: { + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap(); + buf.into() + }, + frozen_local_path_str: OnceLock::new(), conf, timeline_id, tenant_shard_id, start_lsn, end_lsn: OnceLock::new(), + opened_at: Instant::now(), inner: RwLock::new(InMemoryLayerInner { - index: HashMap::new(), + index: BTreeMap::new(), file, + resource_units: GlobalResourceUnits::new(), }), }) } @@ -246,32 +506,17 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. 
/// Adds the page version to the in-memory tree + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - self.put_value_locked(&mut inner, key, lsn, val, ctx).await - } - - pub(crate) async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> Result<()> { - let mut inner = self.inner.write().await; - self.assert_writable(); - for (key, vals) in values { - for (lsn, val) in vals { - self.put_value_locked(&mut inner, *key, *lsn, val, ctx) - .await?; - } - } - Ok(()) + self.put_value_locked(&mut inner, key, lsn, buf, ctx).await } async fn put_value_locked( @@ -279,22 +524,16 @@ impl InMemoryLayer { locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, key: Key, lsn: Lsn, - val: &Value, + buf: &[u8], ctx: &RequestContext, ) -> Result<()> { trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { - // Avoid doing allocations for "small" values. 
- // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: - // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 - let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); - buf.clear(); - val.ser_into(&mut buf)?; locked_inner .file .write_blob( - &buf, + buf, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(), @@ -309,9 +548,22 @@ impl InMemoryLayer { warn!("Key {} at {} already exists", key, lsn); } + let size = locked_inner.file.len(); + locked_inner.resource_units.maybe_publish_size(size); + Ok(()) } + pub(crate) fn get_opened_at(&self) -> Instant { + self.opened_at + } + + pub(crate) async fn tick(&self) -> Option { + let mut inner = self.inner.write().await; + let size = inner.file.len(); + inner.resource_units.publish_size(size) + } + pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys Ok(()) @@ -322,9 +574,23 @@ impl InMemoryLayer { pub async fn freeze(&self, end_lsn: Lsn) { let inner = self.inner.write().await; - assert!(self.start_lsn < end_lsn); + assert!( + self.start_lsn < end_lsn, + "{} >= {}", + self.start_lsn, + end_lsn + ); self.end_lsn.set(end_lsn).expect("end_lsn set only once"); + self.frozen_local_path_str + .set({ + let mut buf = String::new(); + inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn) + .unwrap(); + buf.into() + }) + .expect("frozen_local_path_str set only once"); + for vec_map in inner.index.values() { for (lsn, _pos) in vec_map.as_slice() { assert!(*lsn < end_lsn); @@ -332,14 +598,17 @@ impl InMemoryLayer { } } - /// Write this frozen in-memory layer to disk. + /// Write this frozen in-memory layer to disk. If `key_range` is set, the delta + /// layer will only contain the key range the user specifies, and may return `None` + /// if there are no matching keys. 
/// /// Returns a new delta layer with all the same data as this in-memory layer pub(crate) async fn write_to_disk( &self, timeline: &Arc, ctx: &RequestContext, - ) -> Result { + key_range: Option>, + ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -353,12 +622,28 @@ impl InMemoryLayer { let end_lsn = *self.end_lsn.get().unwrap(); + let keys: Vec<_> = if let Some(key_range) = key_range { + inner + .index + .iter() + .filter(|(k, _)| key_range.contains(k)) + .map(|(k, m)| (k.to_i128(), m)) + .collect() + } else { + inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect() + }; + + if keys.is_empty() { + return Ok(None); + } + let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, Key::MIN, self.start_lsn..end_lsn, + ctx, ) .await?; @@ -366,31 +651,24 @@ impl InMemoryLayer { let cursor = inner.file.block_cursor(); - // Sort the keys because delta layer writer expects them sorted. - // - // NOTE: this sort can take up significant time if the layer has millions of - // keys. To speed up all the comparisons we convert the key to i128 and - // keep the value as a reference. 
- let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect(); - keys.sort_unstable_by_key(|k| k.0); - let ctx = RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::InMemoryLayer) .build(); - for (key, vec_map) in keys.iter() { - let key = Key::from_i128(*key); + for (key, vec_map) in inner.index.iter() { // Write all page versions for (lsn, pos) in vec_map.as_slice() { cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; let will_init = Value::des(&buf)?.will_init(); - delta_layer_writer - .put_value_bytes(key, *lsn, &buf, will_init) - .await?; + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .await; + res?; } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline).await?; - Ok(delta_layer) + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; + Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index f5adf9d639..32acb3f0cd 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1,31 +1,42 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; +use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{ HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus, }; -use pageserver_api::shard::ShardIndex; +use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId}; use std::ops::Range; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{Arc, Weak}; -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; use tracing::Instrument; +use utils::id::TimelineId; use utils::lsn::Lsn; -use utils::sync::heavier_once_cell; +use utils::sync::{gate, heavier_once_cell}; use crate::config::PageServerConf; -use crate::context::RequestContext; +use 
crate::context::{DownloadBehavior, RequestContext}; use crate::repository::Key; -use crate::tenant::{remote_timeline_client::LayerFileMetadata, RemoteTimelineClient, Timeline}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::task_mgr::TaskKind; +use crate::tenant::timeline::GetVectoredError; +use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; use super::delta_layer::{self, DeltaEntry}; -use super::image_layer; +use super::image_layer::{self}; use super::{ - AsLayerDesc, LayerAccessStats, LayerAccessStatsReset, LayerFileName, PersistentLayerDesc, - ValueReconstructResult, ValueReconstructState, + AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, + PersistentLayerDesc, ValueReconstructResult, ValueReconstructState, ValuesReconstructState, }; use utils::generation::Generation; +#[cfg(test)] +mod tests; + +#[cfg(test)] +mod failpoints; + /// A Layer contains all data in a "rectangle" consisting of a range of keys and /// range of LSNs. /// @@ -40,7 +51,41 @@ use utils::generation::Generation; /// An image layer is a snapshot of all the data in a key-range, at a single /// LSN. /// -/// This type models the on-disk layers, which can be evicted and on-demand downloaded. +/// This type models the on-disk layers, which can be evicted and on-demand downloaded. As a +/// general goal, read accesses should always win eviction and eviction should not wait for +/// download. +/// +/// ### State transitions +/// +/// The internal state of `Layer` is composed of most importantly the on-filesystem state and the +/// [`ResidentOrWantedEvicted`] enum. On-filesystem state can be either present (fully downloaded, +/// right size) or deleted. +/// +/// Reads will always win requests to evict until `wait_for_turn_and_evict` has acquired the +/// `heavier_once_cell::InitPermit` and has started to `evict_blocking`. 
Before the +/// `heavier_once_cell::InitPermit` has been acquired, any read request +/// (`get_or_maybe_download`) can "re-initialize" using the existing downloaded file and thus +/// cancelling the eviction. +/// +/// ```text +/// +-----------------+ get_or_maybe_download +--------------------------------+ +/// | not initialized |--------------------------->| Resident(Arc) | +/// | ENOENT | /->| | +/// +-----------------+ | +--------------------------------+ +/// ^ | | ^ +/// | get_or_maybe_download | | | get_or_maybe_download, either: +/// evict_blocking | /-------------------------/ | | - upgrade weak to strong +/// | | | | - re-initialize without download +/// | | evict_and_wait | | +/// +-----------------+ v | +/// | not initialized | on_downloaded_layer_drop +--------------------------------------+ +/// | file is present |<---------------------------| WantedEvicted(Weak) | +/// +-----------------+ +--------------------------------------+ +/// ``` +/// +/// ### Unsupported +/// +/// - Evicting by the operator deleting files from the filesystem /// /// [`InMemoryLayer`]: super::inmemory_layer::InMemoryLayer #[derive(Clone)] @@ -73,19 +118,50 @@ impl AsLayerDesc for Layer { } } +impl PartialEq for Layer { + fn eq(&self, other: &Self) -> bool { + Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0) + } +} + +pub(crate) fn local_layer_path( + conf: &PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + layer_file_name: &LayerName, + generation: &Generation, +) -> Utf8PathBuf { + let timeline_path = conf.timeline_path(tenant_shard_id, timeline_id); + + if generation.is_none() { + // Without a generation, we may only use legacy path style + timeline_path.join(layer_file_name.to_string()) + } else { + timeline_path.join(format!("{}-v1{}", layer_file_name, generation.get_suffix())) + } +} + impl Layer { /// Creates a layer value for a file we know to not be resident. 
pub(crate) fn for_evicted( conf: &'static PageServerConf, timeline: &Arc, - file_name: LayerFileName, + file_name: LayerName, metadata: LayerFileMetadata, ) -> Self { + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &file_name, + &metadata.generation, + ); + let desc = PersistentLayerDesc::from_filename( timeline.tenant_shard_id, timeline.timeline_id, file_name, - metadata.file_size(), + metadata.file_size, ); let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted); @@ -93,6 +169,7 @@ impl Layer { let owner = Layer(Arc::new(LayerInner::new( conf, timeline, + local_path, access_stats, desc, None, @@ -109,14 +186,15 @@ impl Layer { pub(crate) fn for_resident( conf: &'static PageServerConf, timeline: &Arc, - file_name: LayerFileName, + local_path: Utf8PathBuf, + file_name: LayerName, metadata: LayerFileMetadata, ) -> ResidentLayer { let desc = PersistentLayerDesc::from_filename( timeline.tenant_shard_id, timeline.timeline_id, file_name, - metadata.file_size(), + metadata.file_size, ); let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident); @@ -134,6 +212,7 @@ impl Layer { LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -148,7 +227,7 @@ impl Layer { timeline .metrics - .resident_physical_size_add(metadata.file_size()); + .resident_physical_size_add(metadata.file_size); ResidentLayer { downloaded, owner } } @@ -175,9 +254,19 @@ impl Layer { LayerResidenceStatus::Resident, LayerResidenceEventReason::LayerCreate, ); + + let local_path = local_layer_path( + conf, + &timeline.tenant_shard_id, + &timeline.timeline_id, + &desc.layer_name(), + &timeline.generation, + ); + LayerInner::new( conf, timeline, + local_path, access_stats, desc, Some(inner), @@ -188,8 +277,10 @@ impl Layer { let downloaded = resident.expect("just initialized"); - // if the rename works, the path is as expected - std::fs::rename(temp_path, 
owner.local_path()) + // We never want to overwrite an existing file, so we use `RENAME_NOREPLACE`. + // TODO: this leaves the temp file in place if the rename fails, risking us running + // out of space. Should we clean it up here or does the calling context deal with this? + utils::fs_ext::rename_noreplace(temp_path.as_std_path(), owner.local_path().as_std_path()) .with_context(|| format!("rename temporary file as correct path for {owner}"))?; Ok(ResidentLayer { downloaded, owner }) @@ -202,19 +293,20 @@ impl Layer { /// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is /// re-downloaded, [`EvictionError::Downloaded`] is returned. /// + /// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction + /// will happen regardless the future returned by this method completing unless there is a + /// read access before eviction gets to complete. + /// /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation /// of download-evict cycle on retry. - pub(crate) async fn evict_and_wait( - &self, - rtc: &RemoteTimelineClient, - ) -> Result<(), EvictionError> { - self.0.evict_and_wait(rtc).await + pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { + self.0.evict_and_wait(timeout).await } /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload /// then. /// - /// On drop, this will cause a call to [`RemoteTimelineClient::schedule_deletion_of_unlinked`]. + /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`]. /// This means that the unlinking by [gc] or [compaction] must have happened strictly before /// the value this is called on gets dropped. 
/// @@ -264,6 +356,55 @@ impl Layer { .with_context(|| format!("get_value_reconstruct_data for layer {self}")) } + pub(crate) async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_data: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let layer = self + .0 + .get_or_maybe_download(true, Some(ctx)) + .await + .map_err(|err| match err { + DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; + + self.0 + .access_stats + .record_access(LayerAccessKind::GetValueReconstructData, ctx); + + layer + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) + .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) + .await + .map_err(|err| match err { + GetVectoredError::Other(err) => GetVectoredError::Other( + err.context(format!("get_values_reconstruct_data for layer {self}")), + ), + err => err, + }) + } + + /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. + #[cfg(test)] + pub(crate) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let layer = self + .0 + .get_or_maybe_download(true, Some(ctx)) + .await + .map_err(|err| match err { + DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; + layer.load_key_values(&self.0, ctx).await + } + /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. @@ -275,21 +416,28 @@ impl Layer { /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction /// while the guard exists. /// - /// Returns None if the layer is currently evicted. 
- pub(crate) async fn keep_resident(&self) -> anyhow::Result> { - let downloaded = match self.0.get_or_maybe_download(false, None).await { - Ok(d) => d, - // technically there are a lot of possible errors, but in practice it should only be - // DownloadRequired which is tripped up. could work to improve this situation - // statically later. - Err(DownloadError::DownloadRequired) => return Ok(None), - Err(e) => return Err(e.into()), - }; + /// Returns None if the layer is currently evicted or becoming evicted. + #[cfg(test)] + pub(crate) async fn keep_resident(&self) -> Option { + let downloaded = self.0.inner.get().and_then(|rowe| rowe.get())?; - Ok(Some(ResidentLayer { + Some(ResidentLayer { downloaded, owner: self.clone(), - })) + }) + } + + /// Weak indicator of is the layer resident or not. Good enough for eviction, which can deal + /// with `EvictionError::NotFound`. + /// + /// Returns `true` if this layer might be resident, or `false`, if it most likely evicted or + /// will be unless a read happens soon. + pub(crate) fn is_likely_resident(&self) -> bool { + self.0 + .inner + .get() + .map(|rowe| rowe.is_likely_resident()) + .unwrap_or(false) } /// Downloads if necessary and creates a guard, which will keep this layer from being evicted. @@ -314,10 +462,21 @@ impl Layer { &self.0.path } + pub(crate) fn debug_str(&self) -> &Arc { + &self.0.debug_str + } + pub(crate) fn metadata(&self) -> LayerFileMetadata { self.0.metadata() } + pub(crate) fn get_timeline_id(&self) -> Option { + self.0 + .timeline + .upgrade() + .map(|timeline| timeline.timeline_id) + } + /// Traditional debug dumping facility #[allow(unused)] pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> anyhow::Result<()> { @@ -337,13 +496,13 @@ impl Layer { /// /// Does not start local deletion, use [`Self::delete_on_drop`] for that /// separatedly. 
- #[cfg(feature = "testing")] + #[cfg(any(feature = "testing", test))] pub(crate) fn wait_drop(&self) -> impl std::future::Future + 'static { - let mut rx = self.0.status.subscribe(); + let mut rx = self.0.status.as_ref().unwrap().subscribe(); async move { loop { - if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await { + if rx.changed().await.is_err() { break; } } @@ -365,6 +524,32 @@ enum ResidentOrWantedEvicted { } impl ResidentOrWantedEvicted { + /// Non-mutating access to the a DownloadedLayer, if possible. + /// + /// This is not used on the read path (anything that calls + /// [`LayerInner::get_or_maybe_download`]) because it was decided that reads always win + /// evictions, and part of that winning is using [`ResidentOrWantedEvicted::get_and_upgrade`]. + #[cfg(test)] + fn get(&self) -> Option> { + match self { + ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()), + ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.upgrade(), + } + } + + /// Best-effort query for residency right now, not as strong guarantee as receiving a strong + /// reference from `ResidentOrWantedEvicted::get`. + fn is_likely_resident(&self) -> bool { + match self { + ResidentOrWantedEvicted::Resident(_) => true, + ResidentOrWantedEvicted::WantedEvicted(weak, _) => weak.strong_count() > 0, + } + } + + /// Upgrades any weak to strong if possible. + /// + /// Returns a strong reference if possible, along with a boolean telling if an upgrade + /// happened. fn get_and_upgrade(&mut self) -> Option<(Arc, bool)> { match self { ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)), @@ -385,7 +570,7 @@ impl ResidentOrWantedEvicted { /// /// Returns `Some` if this was the first time eviction was requested. Care should be taken to /// drop the possibly last strong reference outside of the mutex of - /// heavier_once_cell::OnceCell. + /// [`heavier_once_cell::OnceCell`]. 
fn downgrade(&mut self) -> Option> { match self { ResidentOrWantedEvicted::Resident(strong) => { @@ -410,38 +595,52 @@ struct LayerInner { /// Full path to the file; unclear if this should exist anymore. path: Utf8PathBuf, + /// String representation of the layer, used for traversal id. + debug_str: Arc, + desc: PersistentLayerDesc, /// Timeline access is needed for remote timeline client and metrics. + /// + /// There should not be an access to timeline for any reason without entering the + /// [`Timeline::gate`] at the same time. timeline: Weak, - /// Cached knowledge of [`Timeline::remote_client`] being `Some`. - have_remote_client: bool, - access_stats: LayerAccessStats, /// This custom OnceCell is backed by std mutex, but only held for short time periods. - /// Initialization and deinitialization are done while holding a permit. + /// + /// Filesystem changes (download, evict) are only done while holding a permit which the + /// `heavier_once_cell` provides. + /// + /// A number of fields in `Layer` are meant to only be updated when holding the InitPermit, but + /// possibly read while not holding it. inner: heavier_once_cell::OnceCell, /// Do we want to delete locally and remotely this when `LayerInner` is dropped wanted_deleted: AtomicBool, - /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses - /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger - /// [`LayerInner::on_downloaded_layer_drop`]. - wanted_evicted: AtomicBool, - - /// Version is to make sure we will only evict a specific download of a file. + /// Version is to make sure we will only evict a specific initialization of the downloaded file. /// - /// Incremented for each download, stored in `DownloadedLayer::version` or + /// Incremented for each initialization, stored in `DownloadedLayer::version` or /// `ResidentOrWantedEvicted::WantedEvicted`. 
version: AtomicUsize, - /// Allow subscribing to when the layer actually gets evicted. - status: tokio::sync::broadcast::Sender, + /// Allow subscribing to when the layer actually gets evicted, a non-cancellable download + /// starts, or completes. + /// + /// Updates must only be posted while holding the InitPermit or the heavier_once_cell::Guard. + /// Holding the InitPermit is the only time we can do state transitions, but we also need to + /// cancel a pending eviction on upgrading a [`ResidentOrWantedEvicted::WantedEvicted`] back to + /// [`ResidentOrWantedEvicted::Resident`] on access. + /// + /// The sender is wrapped in an Option to facilitate moving it out on [`LayerInner::drop`]. + status: Option>, - /// Counter for exponential backoff with the download + /// Counter for exponential backoff with the download. + /// + /// This is atomic only for the purposes of having additional data only accessed while holding + /// the InitPermit. consecutive_failures: AtomicUsize, /// The generation of this Layer. @@ -459,7 +658,13 @@ struct LayerInner { /// a shard split since the layer was originally written. shard: ShardIndex, + /// When the Layer was last evicted but has not been downloaded since. + /// + /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`]. 
last_evicted_at: std::sync::Mutex>, + + #[cfg(test)] + failpoints: std::sync::Mutex>, } impl std::fmt::Display for LayerInner { @@ -476,34 +681,54 @@ impl AsLayerDesc for LayerInner { #[derive(Debug, Clone, Copy)] enum Status { + Resident, Evicted, - Downloaded, + Downloading, } impl Drop for LayerInner { fn drop(&mut self) { + // if there was a pending eviction, mark it cancelled here to balance metrics + if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit() + { + // eviction has already been started + LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); + + // eviction request is intentionally not honored as no one is present to wait for it + // and we could be delaying shutdown for nothing. + } + if !*self.wanted_deleted.get_mut() { - // should we try to evict if the last wish was for eviction? - // feels like there's some hazard of overcrowding near shutdown near by, but we don't - // run drops during shutdown (yet) return; } let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id); let path = std::mem::take(&mut self.path); - let file_name = self.layer_desc().filename(); + let file_name = self.layer_desc().layer_name(); let file_size = self.layer_desc().file_size; let timeline = self.timeline.clone(); let meta = self.metadata(); - let status = self.status.clone(); + let status = self.status.take(); - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { + Self::spawn_blocking(move || { let _g = span.entered(); // carry this until we are finished for [`Layer::wait_drop`] support let _status = status; + let Some(timeline) = timeline.upgrade() else { + // no need to nag that timeline is gone: under normal situation on + // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. 
+ LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + return; + }; + + let Ok(_guard) = timeline.gate.enter() else { + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + return; + }; + let removed = match std::fs::remove_file(path) { Ok(()) => true, Err(e) if e.kind() == std::io::ErrorKind::NotFound => { @@ -522,75 +747,74 @@ impl Drop for LayerInner { } }; - if let Some(timeline) = timeline.upgrade() { - if removed { - timeline.metrics.resident_physical_size_sub(file_size); - } - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]); + if removed { + timeline.metrics.resident_physical_size_sub(file_size); + } + let res = timeline + .remote_client + .schedule_deletion_of_unlinked(vec![(file_name, meta)]); - if let Err(e) = res { - // test_timeline_deletion_with_files_stuck_in_upload_queue is good at - // demonstrating this deadlock (without spawn_blocking): stop will drop - // queued items, which will have ResidentLayer's, and those drops would try - // to re-entrantly lock the RemoteTimelineClient inner state. - if !timeline.is_active() { - tracing::info!("scheduling deletion on drop failed: {e:#}"); - } else { - tracing::warn!("scheduling deletion on drop failed: {e:#}"); - } - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); - } else { - LAYER_IMPL_METRICS.inc_completed_deletes(); - } + if let Err(e) = res { + // test_timeline_deletion_with_files_stuck_in_upload_queue is good at + // demonstrating this deadlock (without spawn_blocking): stop will drop + // queued items, which will have ResidentLayer's, and those drops would try + // to re-entrantly lock the RemoteTimelineClient inner state. 
+ if !timeline.is_active() { + tracing::info!("scheduling deletion on drop failed: {e:#}"); + } else { + tracing::warn!("scheduling deletion on drop failed: {e:#}"); } + LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed); } else { - // no need to nag that timeline is gone: under normal situation on - // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. - LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); + LAYER_IMPL_METRICS.inc_completed_deletes(); } }); } } impl LayerInner { + #[allow(clippy::too_many_arguments)] fn new( conf: &'static PageServerConf, timeline: &Arc, + local_path: Utf8PathBuf, access_stats: LayerAccessStats, desc: PersistentLayerDesc, downloaded: Option>, generation: Generation, shard: ShardIndex, ) -> Self { - let path = conf - .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id) - .join(desc.filename().to_string()); - - let (inner, version) = if let Some(inner) = downloaded { + let (inner, version, init_status) = if let Some(inner) = downloaded { let version = inner.version; let resident = ResidentOrWantedEvicted::Resident(inner); - (heavier_once_cell::OnceCell::new(resident), version) + ( + heavier_once_cell::OnceCell::new(resident), + version, + Status::Resident, + ) } else { - (heavier_once_cell::OnceCell::default(), 0) + (heavier_once_cell::OnceCell::default(), 0, Status::Evicted) }; LayerInner { conf, - path, + debug_str: { + format!("timelines/{}/{}", timeline.timeline_id, desc.layer_name()).into() + }, + path: local_path, desc, timeline: Arc::downgrade(timeline), - have_remote_client: timeline.remote_client.is_some(), access_stats, wanted_deleted: AtomicBool::new(false), - wanted_evicted: AtomicBool::new(false), inner, version: AtomicUsize::new(version), - status: tokio::sync::broadcast::channel(1).0, + status: Some(tokio::sync::watch::channel(init_status).0), consecutive_failures: AtomicUsize::new(0), generation, shard, last_evicted_at: 
std::sync::Mutex::default(), + #[cfg(test)] + failpoints: Default::default(), } } @@ -606,51 +830,67 @@ impl LayerInner { /// Cancellation safe, however dropping the future and calling this method again might result /// in a new attempt to evict OR join the previously started attempt. - pub(crate) async fn evict_and_wait( - &self, - _: &RemoteTimelineClient, - ) -> Result<(), EvictionError> { - use tokio::sync::broadcast::error::RecvError; + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, ret, err(level = tracing::Level::DEBUG), fields(layer=%self))] + pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> { + let mut rx = self.status.as_ref().unwrap().subscribe(); - assert!(self.have_remote_client); - - let mut rx = self.status.subscribe(); + { + let current = rx.borrow_and_update(); + match &*current { + Status::Resident => { + // we might get lucky and evict this; continue + } + Status::Evicted | Status::Downloading => { + // it is already evicted + return Err(EvictionError::NotFound); + } + } + } let strong = { match self.inner.get() { - Some(mut either) => { - self.wanted_evicted.store(true, Ordering::Relaxed); - either.downgrade() + Some(mut either) => either.downgrade(), + None => { + // we already have a scheduled eviction, which just has not gotten to run yet. + // it might still race with a read access, but that could also get cancelled, + // so let's say this is not evictable. + return Err(EvictionError::NotFound); } - None => return Err(EvictionError::NotFound), } }; if strong.is_some() { // drop the DownloadedLayer outside of the holding the guard drop(strong); + + // idea here is that only one evicter should ever get to witness a strong reference, + // which means whenever get_or_maybe_download upgrades a weak, it must mark up a + // cancelled eviction and signal us, like it currently does. + // + // a second concurrent evict_and_wait will not see a strong reference. 
LAYER_IMPL_METRICS.inc_started_evictions(); } - match rx.recv().await { - Ok(Status::Evicted) => Ok(()), - Ok(Status::Downloaded) => Err(EvictionError::Downloaded), - Err(RecvError::Closed) => { - unreachable!("sender cannot be dropped while we are in &self method") - } - Err(RecvError::Lagged(_)) => { - // this is quite unlikely, but we are blocking a lot in the async context, so - // we might be missing this because we are stuck on a LIFO slot on a thread - // which is busy blocking for a 1TB database create_image_layers. - // - // use however late (compared to the initial expressing of wanted) as the - // "outcome" now - LAYER_IMPL_METRICS.inc_broadcast_lagged(); - match self.inner.get() { - Some(_) => Err(EvictionError::Downloaded), - None => Ok(()), - } - } + let changed = rx.changed(); + let changed = tokio::time::timeout(timeout, changed).await; + + let Ok(changed) = changed else { + return Err(EvictionError::Timeout); + }; + + let _: () = changed.expect("cannot be closed, because we are holding a strong reference"); + + let current = rx.borrow_and_update(); + + match &*current { + // the easiest case + Status::Evicted => Ok(()), + // it surely was evicted in between, but then there was a new access now; we can't know + // if it'll succeed so lets just call it evicted + Status::Downloading => Ok(()), + // either the download which was started after eviction completed already, or it was + // never evicted + Status::Resident => Err(EvictionError::Downloaded), } } @@ -660,154 +900,122 @@ impl LayerInner { allow_download: bool, ctx: Option<&RequestContext>, ) -> Result, DownloadError> { - let mut init_permit = None; + let (weak, permit) = { + // get_or_init_detached can: + // - be fast (mutex lock) OR uncontested semaphore permit acquire + // - be slow (wait for semaphore permit or closing) + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - loop { - let download = move |permit| { - async move { - // disable any scheduled 
but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + let locked = self + .inner + .get_or_init_detached() + .await + .map(|mut guard| guard.get_and_upgrade().ok_or(guard)); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + scopeguard::ScopeGuard::into_inner(init_cancelled); - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); - - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; - - // FIXME: grab a gate - - let can_ever_evict = timeline.remote_client.as_ref().is_some(); - - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed)?; - - let permit = if let Some(reason) = needs_download { - if let NeedsDownload::NotFile(ft) = reason { - return Err(DownloadError::NotFile(ft)); - } - - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); - - if !can_ever_evict { - return Err(DownloadError::NoRemoteStorage); - } - - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } - - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - return Err(DownloadError::DownloadRequired); - } - - tracing::info!(%reason, "downloading on-demand"); - - self.spawn_download_and_wait(timeline, permit).await? 
- } else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. - LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - permit - }; - - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); - } - - let res = Arc::new(DownloadedLayer { - owner: Arc::downgrade(self), - kind: tokio::sync::OnceCell::default(), - version: next_version, - }); - - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - - let waiters = self.inner.initializer_count(); - if waiters > 0 { - tracing::info!( - waiters, - "completing the on-demand download for other tasks" - ); - } - - scopeguard::ScopeGuard::into_inner(init_cancelled); - - Ok((ResidentOrWantedEvicted::Resident(res), permit)) - } - .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) - }; - - if let Some(init_permit) = init_permit.take() { - // use the already held initialization permit because it is impossible to hit the - // below paths anymore essentially limiting the max loop iterations to 2. - let (value, init_permit) = download(init_permit).await?; - let mut guard = self.inner.set(value, init_permit); - let (strong, _upgraded) = guard - .get_and_upgrade() - .expect("init creates strong reference, we held the init permit"); - return Ok(strong); - } - - let (weak, permit) = { - let mut locked = self.inner.get_or_init(download).await?; - - if let Some((strong, upgraded)) = locked.get_and_upgrade() { - if upgraded { - // when upgraded back, the Arc is still available, but - // previously a `evict_and_wait` was received. 
- self.wanted_evicted.store(false, Ordering::Relaxed); - - // error out any `evict_and_wait` - drop(self.status.send(Status::Downloaded)); - LAYER_IMPL_METRICS - .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess); - } + match locked { + // this path could had been a RwLock::read + Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong), + Ok(Ok((strong, _))) => { + // when upgraded back, the Arc is still available, but + // previously a `evict_and_wait` was received. this is the only place when we + // send out an update without holding the InitPermit. + // + // note that we also have dropped the Guard; this is fine, because we just made + // a state change and are holding a strong reference to be returned. + self.status.as_ref().unwrap().send_replace(Status::Resident); + LAYER_IMPL_METRICS + .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess); return Ok(strong); - } else { - // path to here: the evict_blocking is stuck on spawn_blocking queue. - // - // reset the contents, deactivating the eviction and causing a - // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed. - locked.take_and_deinit() } - }; - - // unlock first, then drop the weak, but because upgrade failed, we - // know it cannot be a problem. + Ok(Err(guard)) => { + // path to here: we won the eviction, the file should still be on the disk. 
+ let (weak, permit) = guard.take_and_deinit(); + (Some(weak), permit) + } + Err(permit) => (None, permit), + } + }; + if let Some(weak) = weak { + // only drop the weak after dropping the heavier_once_cell guard assert!( matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)), "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug" ); - - init_permit = Some(permit); - - LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download(); } + + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; + + // count cancellations, which currently remain largely unexpected + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + + // check if we really need to be downloaded: this can happen if a read access won the + // semaphore before eviction. + // + // if we are cancelled while doing this `stat` the `self.inner` will be uninitialized. a + // pending eviction will try to evict even upon finding an uninitialized `self.inner`. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed); + + scopeguard::ScopeGuard::into_inner(init_cancelled); + + let needs_download = needs_download?; + + let Some(reason) = needs_download else { + // the file is present locally because eviction has not had a chance to run yet + + #[cfg(test)] + self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload) + .await?; + + LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + return Ok(self.initialize_after_layer_is_on_disk(permit)); + }; + + // we must download; getting cancelled before spawning the download is not an issue as + // any still running eviction would not find anything to evict. 
+ + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this is only used from tests, but it is hard to test without the boolean + return Err(DownloadError::DownloadRequired); + } + + let download_ctx = ctx + .map(|ctx| ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download)) + .unwrap_or(RequestContext::new( + TaskKind::LayerDownload, + DownloadBehavior::Download, + )); + + async move { + tracing::info!(%reason, "downloading on-demand"); + + let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + let res = self + .download_init_and_wait(timeline, permit, download_ctx) + .await?; + scopeguard::ScopeGuard::into_inner(init_cancelled); + Ok(res) + } + .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) + .await } /// Nag or fail per RequestContext policy @@ -837,121 +1045,200 @@ impl LayerInner { } /// Actual download, at most one is executed at the time. - async fn spawn_download_and_wait( + async fn download_init_and_wait( self: &Arc, timeline: Arc, permit: heavier_once_cell::InitPermit, - ) -> Result { - let task_name = format!("download layer {}", self); + ctx: RequestContext, + ) -> Result, DownloadError> { + debug_assert_current_span_has_tenant_and_timeline_id(); let (tx, rx) = tokio::sync::oneshot::channel(); - // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot - // block tenant::mgr::remove_tenant_from_memory. 
- let this: Arc = self.clone(); - crate::task_mgr::spawn( - &tokio::runtime::Handle::current(), - crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_shard_id), - Some(self.desc.timeline_id), - &task_name, - false, + let guard = timeline + .gate + .enter() + .map_err(|_| DownloadError::DownloadCancelled)?; + + Self::spawn( async move { + let _guard = guard; - let client = timeline - .remote_client + // now that we have commited to downloading, send out an update to: + // - unhang any pending eviction + // - break out of evict_and_wait + this.status .as_ref() - .expect("checked above with have_remote_client"); + .unwrap() + .send_replace(Status::Downloading); - let result = client.download_layer_file( - &this.desc.filename(), - &this.metadata(), - &crate::task_mgr::shutdown_token() - ) - .await; + #[cfg(test)] + this.failpoint(failpoints::FailpointKind::WaitBeforeDownloading) + .await + .unwrap(); - let result = match result { - Ok(size) => { - timeline.metrics.resident_physical_size_add(size); - Ok(()) - } - Err(e) => { - let consecutive_failures = - this.consecutive_failures.fetch_add(1, Ordering::Relaxed); + let res = this.download_and_init(timeline, permit, &ctx).await; - let backoff = utils::backoff::exponential_backoff_duration_seconds( - consecutive_failures.min(u32::MAX as usize) as u32, - 1.5, - 60.0, - ); - - let backoff = std::time::Duration::from_secs_f64(backoff); - - tokio::select! { - _ = tokio::time::sleep(backoff) => {}, - _ = crate::task_mgr::shutdown_token().cancelled_owned() => {}, - _ = timeline.cancel.cancelled() => {}, - }; - - Err(e) - } - }; - - if let Err(res) = tx.send((result, permit)) { + if let Err(res) = tx.send(res) { match res { - (Ok(()), _) => { - // our caller is cancellation safe so this is fine; if someone - // else requests the layer, they'll find it already downloaded. 
- // - // See counter [`LayerImplMetrics::inc_init_needed_no_download`] - // - // FIXME(#6028): however, could be that we should consider marking the - // layer for eviction? alas, cannot: because only DownloadedLayer will - // handle that. - }, - (Err(e), _) => { - // our caller is cancellation safe, but we might be racing with - // another attempt to initialize. before we have cancellation - // token support: these attempts should converge regardless of - // their completion order. - tracing::error!("layer file download failed, and additionally failed to communicate this to caller: {e:?}"); + Ok(_res) => { + tracing::debug!("layer initialized, but caller has been cancelled"); + LAYER_IMPL_METRICS.inc_init_completed_without_requester(); + } + Err(e) => { + tracing::info!( + "layer file download failed, and caller has been cancelled: {e:?}" + ); LAYER_IMPL_METRICS.inc_download_failed_without_requester(); } } } - - Ok(()) } .in_current_span(), ); + match rx.await { - Ok((Ok(()), permit)) => { - if let Some(reason) = self - .needs_download() - .await - .map_err(DownloadError::PostStatFailed)? - { - // this is really a bug in needs_download or remote timeline client - panic!("post-condition failed: needs_download returned {reason:?}"); - } - - self.consecutive_failures.store(0, Ordering::Relaxed); - tracing::info!("on-demand download successful"); - - Ok(permit) - } - Ok((Err(e), _permit)) => { + Ok(Ok(res)) => Ok(res), + Ok(Err(e)) => { // sleep already happened in the spawned task, if it was not cancelled - let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed); - tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); - Err(DownloadError::DownloadFailed) + match e.downcast_ref::() { + // If the download failed due to its cancellation token, + // propagate the cancellation error upstream. 
+ Some(remote_storage::DownloadError::Cancelled) => { + Err(DownloadError::DownloadCancelled) + } + // FIXME: this is not embedding the error because historically it would had + // been output to compute, however that is no longer the case. + _ => Err(DownloadError::DownloadFailed), + } } Err(_gone) => Err(DownloadError::DownloadCancelled), } } + async fn download_and_init( + self: &Arc, + timeline: Arc, + permit: heavier_once_cell::InitPermit, + ctx: &RequestContext, + ) -> anyhow::Result> { + let result = timeline + .remote_client + .download_layer_file( + &self.desc.layer_name(), + &self.metadata(), + &self.path, + &timeline.cancel, + ctx, + ) + .await; + + match result { + Ok(size) => { + assert_eq!(size, self.desc.file_size); + + match self.needs_download().await { + Ok(Some(reason)) => { + // this is really a bug in needs_download or remote timeline client + panic!("post-condition failed: needs_download returned {reason:?}"); + } + Ok(None) => { + // as expected + } + Err(e) => { + panic!("post-condition failed: needs_download errored: {e:?}"); + } + } + + tracing::info!(size=%self.desc.file_size, "on-demand download successful"); + timeline + .metrics + .resident_physical_size_add(self.desc.file_size); + self.consecutive_failures.store(0, Ordering::Relaxed); + + let since_last_eviction = self + .last_evicted_at + .lock() + .unwrap() + .take() + .map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + } + + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + Ok(self.initialize_after_layer_is_on_disk(permit)) + } + Err(e) => { + let consecutive_failures = + 1 + self.consecutive_failures.fetch_add(1, Ordering::Relaxed); + + if timeline.cancel.is_cancelled() { + // If we're shutting down, drop out before logging the error + return Err(e); + } + + tracing::error!(consecutive_failures, 
"layer file download failed: {e:#}"); + + let backoff = utils::backoff::exponential_backoff_duration_seconds( + consecutive_failures.min(u32::MAX as usize) as u32, + 1.5, + 60.0, + ); + + let backoff = std::time::Duration::from_secs_f64(backoff); + + tokio::select! { + _ = tokio::time::sleep(backoff) => {}, + _ = timeline.cancel.cancelled() => {}, + }; + + Err(e) + } + } + } + + /// Initializes the `Self::inner` to a "resident" state. + /// + /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download` + /// before calling this method. + /// + /// If this method is ever made async, it needs to be cancellation safe so that no state + /// changes are made before we can write to the OnceCell in non-cancellable fashion. + fn initialize_after_layer_is_on_disk( + self: &Arc, + permit: heavier_once_cell::InitPermit, + ) -> Arc { + debug_assert_current_span_has_tenant_and_timeline_id(); + + // disable any scheduled but not yet running eviction deletions for this initialization + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + self.status.as_ref().unwrap().send_replace(Status::Resident); + + let res = Arc::new(DownloadedLayer { + owner: Arc::downgrade(self), + kind: tokio::sync::OnceCell::default(), + version: next_version, + }); + + let waiters = self.inner.initializer_count(); + if waiters > 0 { + tracing::info!(waiters, "completing layer init for other tasks"); + } + + let value = ResidentOrWantedEvicted::Resident(res.clone()); + + self.inner.set(value, permit); + + res + } + async fn needs_download(&self) -> Result, std::io::Error> { match tokio::fs::metadata(&self.path).await { Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()), @@ -983,11 +1270,13 @@ impl LayerInner { } fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo { - let layer_file_name = self.desc.filename().file_name(); + let layer_name = self.desc.layer_name().to_string(); - // this is not accurate: we could have the file locally 
but there was a cancellation - // and now we are not in sync, or we are currently downloading it. - let remote = self.inner.get().is_none(); + let resident = self + .inner + .get() + .map(|rowe| rowe.is_likely_resident()) + .unwrap_or(false); let access_stats = self.access_stats.as_api_model(reset); @@ -995,110 +1284,201 @@ impl LayerInner { let lsn_range = &self.desc.lsn_range; HistoricLayerInfo::Delta { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn_range.start, lsn_end: lsn_range.end, - remote, + remote: !resident, access_stats, + l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()), } } else { let lsn = self.desc.image_layer_lsn(); HistoricLayerInfo::Image { - layer_file_name, + layer_file_name: layer_name, layer_file_size: self.desc.file_size, lsn_start: lsn, - remote, + remote: !resident, access_stats, } } } /// `DownloadedLayer` is being dropped, so it calls this method. - fn on_downloaded_layer_drop(self: Arc, version: usize) { - let delete = self.wanted_deleted.load(Ordering::Acquire); - let evict = self.wanted_evicted.load(Ordering::Acquire); - let can_evict = self.have_remote_client; + fn on_downloaded_layer_drop(self: Arc, only_version: usize) { + // we cannot know without inspecting LayerInner::inner if we should evict or not, even + // though here it is very likely + let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, version=%only_version); - if delete { - // do nothing now, only in LayerInner::drop -- this was originally implemented because - // we could had already scheduled the deletion at the time. - // - // FIXME: this is not true anymore, we can safely evict wanted deleted files. 
- } else if can_evict && evict { - let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version); + // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might + // drop while the `self.inner` is being locked, leading to a deadlock. - // downgrade for queueing, in case there's a tear down already ongoing we should not - // hold it alive. - let this = Arc::downgrade(&self); - drop(self); + let start_evicting = async move { + #[cfg(test)] + self.failpoint(failpoints::FailpointKind::WaitBeforeStartingEvicting) + .await + .expect("failpoint should not have errored"); - // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might - // drop while the `self.inner` is being locked, leading to a deadlock. + tracing::debug!("eviction started"); - crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || { - let _g = span.entered(); + let res = self.wait_for_turn_and_evict(only_version).await; + // metrics: ignore the Ok branch, it is not done yet + if let Err(e) = res { + tracing::debug!(res=?Err::<(), _>(&e), "eviction completed"); + LAYER_IMPL_METRICS.inc_eviction_cancelled(e); + } + }; - // if LayerInner is already dropped here, do nothing because the delete on drop - // has already ran while we were in queue - let Some(this) = this.upgrade() else { - LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone); - return; - }; - match this.evict_blocking(version) { - Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), - Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason), - } - }); - } + Self::spawn(start_evicting.instrument(span)); } - fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> { - // deleted or detached timeline, don't do anything. 
- let Some(timeline) = self.timeline.upgrade() else { + async fn wait_for_turn_and_evict( + self: Arc, + only_version: usize, + ) -> Result<(), EvictionCancelled> { + fn is_good_to_continue(status: &Status) -> Result<(), EvictionCancelled> { + use Status::*; + match status { + Resident => Ok(()), + Evicted => Err(EvictionCancelled::UnexpectedEvictedState), + Downloading => Err(EvictionCancelled::LostToDownload), + } + } + + let timeline = self + .timeline + .upgrade() + .ok_or(EvictionCancelled::TimelineGone)?; + + let mut rx = self + .status + .as_ref() + .expect("LayerInner cannot be dropped, holding strong ref") + .subscribe(); + + is_good_to_continue(&rx.borrow_and_update())?; + + let Ok(gate) = timeline.gate.enter() else { return Err(EvictionCancelled::TimelineGone); }; - // to avoid starting a new download while we evict, keep holding on to the - // permit. - let _permit = { - let maybe_downloaded = self.inner.get(); + let permit = { + // we cannot just `std::fs::remove_file` because there might already be an + // get_or_maybe_download which will inspect filesystem and reinitialize. filesystem + // operations must be done while holding the heavier_once_cell::InitPermit + let mut wait = std::pin::pin!(self.inner.get_or_init_detached()); - let (_weak, permit) = match maybe_downloaded { - Some(mut guard) => { - if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard { - if *version == only_version { - guard.take_and_deinit() - } else { - // this was not for us; maybe there's another eviction job - // TODO: does it make any sense to stall here? unique versions do not - // matter, we only want to make sure not to evict a resident, which we - // are not doing. - return Err(EvictionCancelled::VersionCheckFailed); - } - } else { - return Err(EvictionCancelled::AlreadyReinitialized); + let waited = loop { + // we must race to the Downloading starting, otherwise we would have to wait until the + // completion of the download. 
waiting for download could be long and hinder our + // efforts to alert on "hanging" evictions. + tokio::select! { + res = &mut wait => break res, + _ = rx.changed() => { + is_good_to_continue(&rx.borrow_and_update())?; + // two possibilities for Status::Resident: + // - the layer was found locally from disk by a read + // - we missed a bunch of updates and now the layer is + // again downloaded -- assume we'll fail later on with + // version check or AlreadyReinitialized } } - None => { - // already deinitialized, perhaps get_or_maybe_download did this and is - // currently waiting to reinitialize it - return Err(EvictionCancelled::LostToDownload); + }; + + // re-check now that we have the guard or permit; all updates should have happened + // while holding the permit. + is_good_to_continue(&rx.borrow_and_update())?; + + // the term deinitialize is used here, because we clearing out the Weak will eventually + // lead to deallocating the reference counted value, and the value we + // `Guard::take_and_deinit` is likely to be the last because the Weak is never cloned. + let (_weak, permit) = match waited { + Ok(guard) => { + match &*guard { + ResidentOrWantedEvicted::WantedEvicted(_weak, version) + if *version == only_version => + { + tracing::debug!(version, "deinitializing matching WantedEvicted"); + let (weak, permit) = guard.take_and_deinit(); + (Some(weak), permit) + } + ResidentOrWantedEvicted::WantedEvicted(_, version) => { + // if we were not doing the version check, we would need to try to + // upgrade the weak here to see if it really is dropped. version check + // is done instead assuming that it is cheaper. 
+ tracing::debug!( + version, + only_version, + "version mismatch, not deinitializing" + ); + return Err(EvictionCancelled::VersionCheckFailed); + } + ResidentOrWantedEvicted::Resident(_) => { + return Err(EvictionCancelled::AlreadyReinitialized); + } + } + } + Err(permit) => { + tracing::debug!("continuing after cancelled get_or_maybe_download or eviction"); + (None, permit) } }; permit }; - // now accesses to inner.get_or_init wait on the semaphore or the `_permit` + let span = tracing::Span::current(); - self.access_stats.record_residence_event( - LayerResidenceStatus::Evicted, - LayerResidenceEventReason::ResidenceChange, - ); + let spawned_at = std::time::Instant::now(); - let res = match capture_mtime_and_remove(&self.path) { + // this is on purpose a detached spawn; we don't need to wait for it + // + // eviction completion reporting is the only thing hinging on this, and it can be just as + // well from a spawn_blocking thread. + // + // important to note that now that we've acquired the permit we have made sure the evicted + // file is either the exact `WantedEvicted` we wanted to evict, or uninitialized in case + // there are multiple evictions. The rest is not cancellable, and we've now commited to + // evicting. + // + // If spawn_blocking has a queue and maximum number of threads are in use, we could stall + // reads. We will need to add cancellation for that if necessary. 
+ Self::spawn_blocking(move || { + let _span = span.entered(); + + let res = self.evict_blocking(&timeline, &gate, &permit); + + let waiters = self.inner.initializer_count(); + + if waiters > 0 { + LAYER_IMPL_METRICS.inc_evicted_with_waiters(); + } + + let completed_in = spawned_at.elapsed(); + LAYER_IMPL_METRICS.record_time_to_evict(completed_in); + + match res { + Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(), + Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e), + } + + tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed"); + }); + + Ok(()) + } + + /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs. + fn evict_blocking( + &self, + timeline: &Timeline, + _gate: &gate::GateGuard, + _permit: &heavier_once_cell::InitPermit, + ) -> Result<(), EvictionCancelled> { + // now accesses to `self.inner.get_or_init*` wait on the semaphore or the `_permit` + + match capture_mtime_and_remove(&self.path) { Ok(local_layer_mtime) => { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { @@ -1122,33 +1502,60 @@ impl LayerInner { timeline .metrics .resident_physical_size_sub(self.desc.file_size); - - Ok(()) } Err(e) if e.kind() == std::io::ErrorKind::NotFound => { tracing::error!( layer_size = %self.desc.file_size, - "failed to evict layer from disk, it was already gone (metrics will be inaccurate)" + "failed to evict layer from disk, it was already gone" ); - Err(EvictionCancelled::FileNotFound) + return Err(EvictionCancelled::FileNotFound); } Err(e) => { + // FIXME: this should probably be an abort tracing::error!("failed to evict file from disk: {e:#}"); - Err(EvictionCancelled::RemoveFailed) + return Err(EvictionCancelled::RemoveFailed); } - }; + } - // we are still holding the permit, so no new spawn_download_and_wait can happen - drop(self.status.send(Status::Evicted)); + self.access_stats.record_residence_event( + LayerResidenceStatus::Evicted, + 
LayerResidenceEventReason::ResidenceChange, + ); + + self.status.as_ref().unwrap().send_replace(Status::Evicted); *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now()); - res + Ok(()) } fn metadata(&self) -> LayerFileMetadata { LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard) } + + /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. + /// + /// Synchronizing with spawned tasks is very complicated otherwise. + fn spawn(fut: F) + where + F: std::future::Future + Send + 'static, + { + #[cfg(test)] + tokio::task::spawn(fut); + #[cfg(not(test))] + crate::task_mgr::BACKGROUND_RUNTIME.spawn(fut); + } + + /// Needed to use entered runtime in tests, but otherwise use BACKGROUND_RUNTIME. + fn spawn_blocking(f: F) + where + F: FnOnce() + Send + 'static, + { + #[cfg(test)] + tokio::task::spawn_blocking(f); + #[cfg(not(test))] + crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(f); + } } fn capture_mtime_and_remove(path: &Utf8Path) -> Result { @@ -1166,15 +1573,16 @@ pub(crate) enum EvictionError { /// Evictions must always lose to downloads in races, and this time it happened. 
#[error("layer was downloaded instead")] Downloaded, + + #[error("eviction did not happen within timeout")] + Timeout, } /// Error internal to the [`LayerInner::get_or_maybe_download`] #[derive(Debug, thiserror::Error)] -enum DownloadError { +pub(crate) enum DownloadError { #[error("timeline has already shutdown")] TimelineShutdown, - #[error("no remote storage configured")] - NoRemoteStorage, #[error("context denies downloading")] ContextAndConfigReallyDeniesDownloads, #[error("downloading is really required but not allowed by this method")] @@ -1189,8 +1597,10 @@ enum DownloadError { DownloadCancelled, #[error("pre-condition: stat before download failed")] PreStatFailed(#[source] std::io::Error), - #[error("post-condition: stat after download failed")] - PostStatFailed(#[source] std::io::Error), + + #[cfg(test)] + #[error("failpoint: {0:?}")] + Failpoint(failpoints::FailpointKind), } #[derive(Debug, PartialEq)] @@ -1236,7 +1646,8 @@ impl Drop for DownloadedLayer { if let Some(owner) = self.owner.upgrade() { owner.on_downloaded_layer_drop(self.version); } else { - // no need to do anything, we are shutting down + // Layer::drop will handle cancelling the eviction; because of drop order and + // `DownloadedLayer` never leaking, we cannot know here if eviction was requested. 
} } } @@ -1267,9 +1678,14 @@ impl DownloadedLayer { owner.desc.key_range.clone(), owner.desc.lsn_range.clone(), )); - delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx) - .await - .map(|res| res.map(LayerKind::Delta)) + delta_layer::DeltaLayerInner::load( + &owner.path, + summary, + Some(owner.conf.max_vectored_read_bytes), + ctx, + ) + .await + .map(|res| res.map(LayerKind::Delta)) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1278,9 +1694,15 @@ impl DownloadedLayer { owner.desc.key_range.clone(), lsn, )); - image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx) - .await - .map(|res| res.map(LayerKind::Image)) + image_layer::ImageLayerInner::load( + &owner.path, + lsn, + summary, + Some(owner.conf.max_vectored_read_bytes), + ctx, + ) + .await + .map(|res| res.map(LayerKind::Image)) }; match res { @@ -1330,6 +1752,42 @@ impl DownloadedLayer { } } + async fn get_values_reconstruct_data( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_data: &mut ValuesReconstructState, + owner: &Arc, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + use LayerKind::*; + + match self.get(owner, ctx).await.map_err(GetVectoredError::from)? { + Delta(d) => { + d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) + .await + } + Image(i) => { + i.get_values_reconstruct_data(keyspace, reconstruct_data, ctx) + .await + } + } + } + + #[cfg(test)] + async fn load_key_values( + &self, + owner: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + use LayerKind::*; + + match self.get(owner, ctx).await? { + Delta(d) => d.load_key_values(ctx).await, + Image(i) => i.load_key_values(ctx).await, + } + } + async fn dump(&self, owner: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { use LayerKind::*; match self.get(owner, ctx).await? { @@ -1376,7 +1834,7 @@ impl ResidentLayer { } /// Loads all keys stored in the layer. Returns key, lsn and value size. 
- #[tracing::instrument(skip_all, fields(layer=%self))] + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] pub(crate) async fn load_keys<'a>( &'a self, ctx: &RequestContext, @@ -1384,21 +1842,57 @@ impl ResidentLayer { use LayerKind::*; let owner = &self.owner.0; - match self.downloaded.get(owner, ctx).await? { Delta(ref d) => { + // this is valid because the DownloadedLayer::kind is a OnceCell, not a + // Mutex, so we cannot go and deinitialize the value with OnceCell::take + // while it's being held. owner .access_stats .record_access(LayerAccessKind::KeyIter, ctx); - // this is valid because the DownloadedLayer::kind is a OnceCell, not a - // Mutex, so we cannot go and deinitialize the value with OnceCell::take - // while it's being held. delta_layer::DeltaLayerInner::load_keys(d, ctx) .await - .context("Layer index is corrupted") + .with_context(|| format!("Layer index is corrupted for {self}")) } - Image(_) => anyhow::bail!("cannot load_keys on a image layer"), + Image(_) => anyhow::bail!(format!("cannot load_keys on a image layer {self}")), + } + } + + /// Read all they keys in this layer which match the ShardIdentity, and write them all to + /// the provided writer. Return the number of keys written. + #[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(layer=%self))] + pub(crate) async fn filter<'a>( + &'a self, + shard_identity: &ShardIdentity, + writer: &mut ImageLayerWriter, + ctx: &RequestContext, + ) -> anyhow::Result { + use LayerKind::*; + + match self.downloaded.get(&self.owner.0, ctx).await? { + Delta(_) => anyhow::bail!(format!("cannot filter() on a delta layer {self}")), + Image(i) => i.filter(shard_identity, writer, ctx).await, + } + } + + /// Returns the amount of keys and values written to the writer. 
+ pub(crate) async fn copy_delta_prefix( + &self, + writer: &mut super::delta_layer::DeltaLayerWriter, + until: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + use LayerKind::*; + + let owner = &self.owner.0; + + match self.downloaded.get(owner, ctx).await? { + Delta(ref d) => d + .copy_prefix(writer, until, ctx) + .await + .with_context(|| format!("copy_delta_prefix until {until} of {self}")), + Image(_) => anyhow::bail!(format!("cannot copy_lsn_prefix of image layer {self}")), } } @@ -1406,13 +1900,21 @@ impl ResidentLayer { &self.owner.0.path } - pub(crate) fn access_stats(&self) -> &LayerAccessStats { - self.owner.access_stats() - } - pub(crate) fn metadata(&self) -> LayerFileMetadata { self.owner.metadata() } + + #[cfg(test)] + pub(crate) async fn as_delta( + &self, + ctx: &RequestContext, + ) -> anyhow::Result<&delta_layer::DeltaLayerInner> { + use LayerKind::*; + match self.downloaded.get(&self.owner.0, ctx).await? { + Delta(ref d) => Ok(d), + Image(_) => Err(anyhow::anyhow!("image layer")), + } + } } impl AsLayerDesc for ResidentLayer { @@ -1448,6 +1950,7 @@ pub(crate) struct LayerImplMetrics { rare_counters: enum_map::EnumMap, inits_cancelled: metrics::core::GenericCounter, redownload_after: metrics::Histogram, + time_to_evict: metrics::Histogram, } impl Default for LayerImplMetrics { @@ -1543,6 +2046,13 @@ impl Default for LayerImplMetrics { .unwrap() }; + let time_to_evict = metrics::register_histogram!( + "pageserver_layer_eviction_held_permit_seconds", + "Time eviction held the permit.", + vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000] + ) + .unwrap(); + Self { started_evictions, completed_evictions, @@ -1555,6 +2065,7 @@ impl Default for LayerImplMetrics { rare_counters, inits_cancelled, redownload_after, + time_to_evict, } } } @@ -1586,9 +2097,10 @@ impl LayerImplMetrics { self.rare_counters[RareEvent::RemoveOnDropFailed].inc(); } - /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`. 
- fn inc_retried_get_or_maybe_download(&self) { - self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc(); + /// Expected rare just as cancellations are rare, but we could have cancellations separate from + /// the single caller which can start the download, so use this counter to separate them. + fn inc_init_completed_without_requester(&self) { + self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc(); } /// Expected rare because cancellations are unexpected, and failures are unexpected @@ -1615,10 +2127,6 @@ impl LayerImplMetrics { self.rare_counters[RareEvent::PermanentLoadingFailure].inc(); } - fn inc_broadcast_lagged(&self) { - self.rare_counters[RareEvent::EvictAndWaitLagged].inc(); - } - fn inc_init_cancelled(&self) { self.inits_cancelled.inc() } @@ -1626,9 +2134,22 @@ impl LayerImplMetrics { fn record_redownloaded_after(&self, duration: std::time::Duration) { self.redownload_after.observe(duration.as_secs_f64()) } + + /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably + /// instead cancel eviction if we would have read waiters. We cannot however separate reads + /// from other evictions, so this could have noise as well. + fn inc_evicted_with_waiters(&self) { + self.rare_counters[RareEvent::EvictedWithWaiters].inc(); + } + + /// Recorded at least initially as the permit is now acquired in async context before + /// spawn_blocking action. + fn record_time_to_evict(&self, duration: std::time::Duration) { + self.time_to_evict.observe(duration.as_secs_f64()) + } } -#[derive(enum_map::Enum)] +#[derive(Debug, Clone, Copy, enum_map::Enum)] enum EvictionCancelled { LayerGone, TimelineGone, @@ -1640,6 +2161,7 @@ enum EvictionCancelled { LostToDownload, /// After eviction, there was a new layer access which cancelled the eviction.
UpgradedBackOnAccess, + UnexpectedEvictedState, } impl EvictionCancelled { @@ -1653,6 +2175,7 @@ impl EvictionCancelled { EvictionCancelled::AlreadyReinitialized => "already_reinitialized", EvictionCancelled::LostToDownload => "lost_to_download", EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access", + EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state", } } } @@ -1675,12 +2198,12 @@ impl DeleteFailed { #[derive(enum_map::Enum)] enum RareEvent { RemoveOnDropFailed, - RetriedGetOrMaybeDownload, + InitCompletedWithoutRequester, DownloadFailedWithoutRequester, UpgradedWantedEvicted, InitWithoutDownload, PermanentLoadingFailure, - EvictAndWaitLagged, + EvictedWithWaiters, } impl RareEvent { @@ -1689,12 +2212,12 @@ impl RareEvent { match self { RemoveOnDropFailed => "remove_on_drop_failed", - RetriedGetOrMaybeDownload => "retried_gomd", + InitCompletedWithoutRequester => "init_completed_without", DownloadFailedWithoutRequester => "download_failed_without", UpgradedWantedEvicted => "raced_wanted_evicted", InitWithoutDownload => "init_needed_no_download", PermanentLoadingFailure => "permanent_loading_failure", - EvictAndWaitLagged => "broadcast_lagged", + EvictedWithWaiters => "evicted_with_waiters", } } } diff --git a/pageserver/src/tenant/storage_layer/layer/failpoints.rs b/pageserver/src/tenant/storage_layer/layer/failpoints.rs new file mode 100644 index 0000000000..6cedc41d98 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/layer/failpoints.rs @@ -0,0 +1,119 @@ +//! failpoints for unit tests, implying `#[cfg(test)]`. +//! +//! These are not accessible over http. + +use super::*; + +impl Layer { + /// Enable a failpoint from a unit test. + pub(super) fn enable_failpoint(&self, failpoint: Failpoint) { + self.0.failpoints.lock().unwrap().push(failpoint); + } +} + +impl LayerInner { + /// Query if this failpoint is enabled, as in, arrive at a failpoint. + /// + /// Calls to this method need to be `#[cfg(test)]` guarded. 
+ pub(super) async fn failpoint(&self, kind: FailpointKind) -> Result<(), FailpointHit> { + let fut = { + let mut fps = self.failpoints.lock().unwrap(); + // find the *last* failpoint for cases in which we need to use multiple for the same + // thing (two blocked evictions) + let fp = fps.iter_mut().rfind(|x| x.kind() == kind); + + let Some(fp) = fp else { + return Ok(()); + }; + + fp.hit() + }; + + fut.await + } +} + +#[derive(Debug, PartialEq, Eq)] +pub(crate) enum FailpointKind { + /// Failpoint acts as an accurate cancelled by drop here; see the only site of use. + AfterDeterminingLayerNeedsNoDownload, + /// Failpoint for stalling eviction starting + WaitBeforeStartingEvicting, + /// Failpoint hit in the spawned task + WaitBeforeDownloading, +} + +pub(crate) enum Failpoint { + AfterDeterminingLayerNeedsNoDownload, + WaitBeforeStartingEvicting( + Option, + utils::completion::Barrier, + ), + WaitBeforeDownloading( + Option, + utils::completion::Barrier, + ), +} + +impl Failpoint { + fn kind(&self) -> FailpointKind { + match self { + Failpoint::AfterDeterminingLayerNeedsNoDownload => { + FailpointKind::AfterDeterminingLayerNeedsNoDownload + } + Failpoint::WaitBeforeStartingEvicting(..) => FailpointKind::WaitBeforeStartingEvicting, + Failpoint::WaitBeforeDownloading(..) 
=> FailpointKind::WaitBeforeDownloading, + } + } + + fn hit(&mut self) -> impl std::future::Future> + 'static { + use futures::future::FutureExt; + + // use boxed futures to avoid Either hurdles + match self { + Failpoint::AfterDeterminingLayerNeedsNoDownload => { + let kind = self.kind(); + + async move { Err(FailpointHit(kind)) }.boxed() + } + Failpoint::WaitBeforeStartingEvicting(arrival, b) + | Failpoint::WaitBeforeDownloading(arrival, b) => { + // first one signals arrival + drop(arrival.take()); + + let b = b.clone(); + + async move { + tracing::trace!("waiting on a failpoint barrier"); + b.wait().await; + tracing::trace!("done waiting on a failpoint barrier"); + Ok(()) + } + .boxed() + } + } + } +} + +impl std::fmt::Display for FailpointKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(self, f) + } +} + +#[derive(Debug)] +pub(crate) struct FailpointHit(FailpointKind); + +impl std::fmt::Display for FailpointHit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Debug::fmt(self, f) + } +} + +impl std::error::Error for FailpointHit {} + +impl From for DownloadError { + fn from(value: FailpointHit) -> Self { + DownloadError::Failpoint(value.0) + } +} diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs new file mode 100644 index 0000000000..3a7aca7a6c --- /dev/null +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -0,0 +1,961 @@ +use pageserver_api::key::CONTROLFILE_KEY; +use tokio::task::JoinSet; +use utils::{ + completion::{self, Completion}, + id::TimelineId, +}; + +use super::failpoints::{Failpoint, FailpointKind}; +use super::*; +use crate::context::DownloadBehavior; +use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; + +/// Used in tests to advance a future to wanted await point, and not futher. 
+const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600); + +/// Used in tests to indicate forever long timeout; has to be longer than the amount of ADVANCE +/// timeout uses to advance futures. +const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7); + +/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions. +#[tokio::test] +async fn smoke_test() { + let handle = tokio::runtime::Handle::current(); + + let h = TenantHarness::create("smoke_test").unwrap(); + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + let (tenant, _) = h.load().await; + + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // all layers created at pageserver are like `layer`, initialized with strong + // Arc. + + let img_before = { + let mut data = ValueReconstructState::default(); + layer + .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .await + .unwrap(); + data.img + .take() + .expect("tenant harness writes the control file") + }; + + // important part is evicting the layer, which can be done when there are no more ResidentLayer + // instances -- there currently are none, only two `Layer` values, one in the layermap and on + // in scope. + layer.evict_and_wait(FOREVER).await.unwrap(); + + // double-evict returns an error, which is valid if both eviction_task and disk usage based + // eviction would both evict the same layer at the same time. 
+ + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound)); + + // on accesses when the layer is evicted, it will automatically be downloaded. + let img_after = { + let mut data = ValueReconstructState::default(); + layer + .get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx) + .instrument(download_span.clone()) + .await + .unwrap(); + data.img.take().unwrap() + }; + + assert_eq!(img_before, img_after); + + // evict_and_wait can timeout, but it doesn't cancel the evicting itself + // + // ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to + // artificially slow it down. + let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(&handle).await; + + match layer + .evict_and_wait(std::time::Duration::ZERO) + .await + .unwrap_err() + { + EvictionError::Timeout => { + // expected, but note that the eviction is "still ongoing" + helper.release().await; + // exhaust spawn_blocking pool to ensure it is now complete + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle) + .await; + } + other => unreachable!("{other:?}"), + } + + // only way to query if a layer is resident is to acquire a ResidentLayer instance. + // Layer::keep_resident never downloads, but it might initialize if the layer file is found + // downloaded locally. + let none = layer.keep_resident().await; + assert!( + none.is_none(), + "Expected none, because eviction removed the local file, found: {none:?}" + ); + + // plain downloading is rarely needed + layer + .download_and_keep_resident() + .instrument(download_span) + .await + .unwrap(); + + // last important part is deletion on drop: gc and compaction use it for compacted L0 layers + // or fully garbage collected layers. deletion means deleting the local file, and scheduling a + // deletion of the already unlinked from index_part.json remote file. 
+ // + // marking a layer to be deleted on drop is irreversible; there is no technical reason against + // reversibility, but currently it is not needed so it is not provided. + layer.delete_on_drop(); + + let path = layer.local_path().to_owned(); + + // wait_drop produces an unconnected to Layer future which will resolve when the + // LayerInner::drop has completed. + let mut wait_drop = std::pin::pin!(layer.wait_drop()); + + // paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing + // until here + tokio::time::pause(); + tokio::time::timeout(ADVANCE, &mut wait_drop) + .await + .expect_err("should had timed out because two strong references exist"); + + tokio::fs::metadata(&path) + .await + .expect("the local layer file still exists"); + + let rtc = &timeline.remote_client; + + { + let layers = &[layer]; + let mut g = timeline.layers.write().await; + g.finish_gc_timeline(layers); + // this just updates the remote_physical_size for demonstration purposes + rtc.schedule_gc_update(layers).unwrap(); + } + + // when strong references are dropped, the file is deleted and remote deletion is scheduled + wait_drop.await; + + let e = tokio::fs::metadata(&path) + .await + .expect_err("the local file is deleted"); + assert_eq!(e.kind(), std::io::ErrorKind::NotFound); + + rtc.wait_completion().await.unwrap(); + + assert_eq!(rtc.get_remote_physical_size(), 0); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) +} + +/// This test demonstrates a previous hang when an eviction and deletion were requested at the same +/// time. Now both of them complete per Arc drop semantics.
+#[tokio::test(start_paused = true)] +async fn evict_and_wait_on_wanted_deleted() { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + + let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + { + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + + layer.delete_on_drop(); + + drop(resident); + + // make sure the eviction task gets to run + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + let resident = layer.keep_resident().await; + assert!( + resident.is_none(), + "keep_resident should not have re-initialized: {resident:?}" + ); + + evict_and_wait + .await + .expect("evict_and_wait should had succeeded"); + + // works as intended + } + + // assert that once we remove the `layer` from the layer map and drop our reference, + // the deletion of the layer in remote_storage happens. 
+ { + let mut layers = timeline.layers.write().await; + layers.finish_gc_timeline(&[layer]); + } + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get()); + assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get()); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) +} + +/// This test ensures we are able to read the layer while the layer eviction has been +/// started but not completed. +#[test] +fn read_wins_pending_eviction() { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() + .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create("read_wins_pending_eviction").unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion, barrier) = utils::completion::channel(); + let (arrival, 
arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because the threads are consumed while completion exists + drop(resident); + arrived_at_barrier.wait().await; + assert!(!layer.is_likely_resident()); + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + drop(completion); + + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // now we finally can observe the original eviction failing + // it would had been possible to observe it earlier, but here it is guaranteed to have + // happened. 
+ assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// Use failpoint to delay an eviction starting to get a VersionCheckFailed. +#[test] +fn multiple_pending_evictions_in_order() { + let name = "multiple_pending_evictions_in_order"; + let in_order = true; + multiple_pending_evictions_scenario(name, in_order); +} + +/// Use failpoint to reorder later eviction before first to get a UnexpectedEvictedState. +#[test] +fn multiple_pending_evictions_out_of_order() { + let name = "multiple_pending_evictions_out_of_order"; + let in_order = false; + multiple_pending_evictions_scenario(name, in_order); +} + +fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .start_paused(true) + .build() + .unwrap(); + + rt.block_on(async move { + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create(name).unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // setup done + + let resident = layer.keep_resident().await.unwrap(); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // drive the future to await on the status channel + tokio::time::timeout(ADVANCE, &mut 
evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + let (completion1, barrier) = utils::completion::channel(); + let mut completion1 = Some(completion1); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + // now the eviction cannot proceed because we are simulating arbitrary long delay for the + // eviction task start. + drop(resident); + assert!(!layer.is_likely_resident()); + + arrived_at_barrier.wait().await; + + // because no actual eviction happened, we get to just reinitialize the DownloadedLayer + layer + .0 + .get_or_maybe_download(false, None) + .instrument(download_span) + .await + .expect("should had reinitialized without downloading"); + + assert!(layer.is_likely_resident()); + + // reinitialization notifies of new resident status, which should error out all evict_and_wait + let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect("no timeout, because get_or_maybe_download re-initialized") + .expect_err("eviction should not have succeeded because re-initialized"); + + // works as intended: evictions lose to "downloads" + assert!(matches!(e, EvictionError::Downloaded), "{e:?}"); + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // this is not wrong: the eviction is technically still "on the way" as it's still queued + // because of a failpoint + assert_eq!( + 0, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get()); + + // configure another failpoint for the second eviction -- evictions are per initialization, + // so now that we've reinitialized the inner, we get to run two of them at the same time. 
+ let (completion2, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // advance to the wait on the queue + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect_err("timeout because failpoint is blocking"); + + arrived_at_barrier.wait().await; + + assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get()); + + let mut release_earlier_eviction = |expected_reason| { + assert_eq!( + 0, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + + drop(completion1.take().unwrap()); + + let handle = &handle; + + async move { + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0( + handle, 1, + ) + .await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(), + ); + } + }; + + if in_order { + release_earlier_eviction(EvictionCancelled::VersionCheckFailed).await; + } + + // release the later eviction which is for the current version + drop(completion2); + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1) + .await; + + if !in_order { + release_earlier_eviction(EvictionCancelled::UnexpectedEvictedState).await; + } + + tokio::time::timeout(ADVANCE, &mut second_eviction) + .await + .expect("eviction goes through now that spawn_blocking is unclogged") + .expect("eviction should succeed, because version matches"); + + assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get()); + + // ensure the cancelled are unchanged + assert_eq!( + 1, + LAYER_IMPL_METRICS + .cancelled_evictions + .values() + .map(|ctr| ctr.get()) + .sum::() + ); + + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) + }); +} + +/// The test ensures with a failpoint that a 
pending eviction is not cancelled by what is currently +/// a `Layer::keep_resident` call. +/// +/// This matters because cancelling the eviction would leave us in a state where the file is on +/// disk but the layer internal state says it has not been initialized. Furthermore, it allows us to +/// have non-repairing `Layer::is_likely_resident`. +#[tokio::test(start_paused = true)] +async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { + let handle = tokio::runtime::Handle::current(); + let h = + TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // this failpoint will simulate the `get_or_maybe_download` becoming cancelled (by returning an + // Err) at the right time as in "during" the `LayerInner::needs_download`.
+ layer.enable_failpoint(Failpoint::AfterDeterminingLayerNeedsNoDownload); + + let (completion, barrier) = utils::completion::channel(); + let (arrival, arrived_at_barrier) = utils::completion::channel(); + + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + tokio::time::timeout(ADVANCE, layer.evict_and_wait(FOREVER)) + .await + .expect_err("should had advanced to waiting on channel"); + + arrived_at_barrier.wait().await; + + // simulate a cancelled read which is cancelled before it gets to re-initialize + let e = layer + .0 + .get_or_maybe_download(false, None) + .await + .unwrap_err(); + assert!( + matches!( + e, + DownloadError::Failpoint(FailpointKind::AfterDeterminingLayerNeedsNoDownload) + ), + "{e:?}" + ); + + assert!( + layer.0.needs_download().await.unwrap().is_none(), + "file is still on disk" + ); + + // release the eviction task + drop(completion); + tokio::time::sleep(ADVANCE).await; + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + // failpoint is still enabled, but it is not hit + let e = layer + .0 + .get_or_maybe_download(false, None) + .await + .unwrap_err(); + assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}"); + + // failpoint is not counted as cancellation either + assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get()) +} + +#[tokio::test(start_paused = true)] +async fn evict_and_wait_does_not_wait_for_download() { + // let handle = tokio::runtime::Handle::current(); + let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap(); + let (tenant, ctx) = h.load().await; + let span = h.span(); + let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + let layer = { + let mut layers = { + let layers = timeline.layers.read().await; + 
layers.likely_resident_layers().collect::>() + }; + + assert_eq!(layers.len(), 1); + + layers.swap_remove(0) + }; + + // kind of forced setup: start an eviction but do not allow it progress until we are + // downloading + let (eviction_can_continue, barrier) = utils::completion::channel(); + let (arrival, eviction_arrived) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeStartingEvicting( + Some(arrival), + barrier, + )); + + let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER)); + + // use this once-awaited other_evict to synchronize with the eviction + let other_evict = layer.evict_and_wait(FOREVER); + + tokio::time::timeout(ADVANCE, &mut evict_and_wait) + .await + .expect_err("should had advanced"); + eviction_arrived.wait().await; + drop(eviction_can_continue); + other_evict.await.unwrap(); + + // now the layer is evicted, and the "evict_and_wait" is waiting on the receiver + assert!(!layer.is_likely_resident()); + + // following new evict_and_wait will fail until we've completed the download + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + assert!(matches!(e, EvictionError::NotFound), "{e:?}"); + + let (download_can_continue, barrier) = utils::completion::channel(); + let (arrival, _download_arrived) = utils::completion::channel(); + layer.enable_failpoint(Failpoint::WaitBeforeDownloading(Some(arrival), barrier)); + + let mut download = std::pin::pin!(layer + .0 + .get_or_maybe_download(true, None) + .instrument(download_span)); + + assert!( + !layer.is_likely_resident(), + "during download layer is evicted" + ); + + tokio::time::timeout(ADVANCE, &mut download) + .await + .expect_err("should had timed out because of failpoint"); + + // now we finally get to continue, and because the latest state is downloading, we deduce that + // original eviction succeeded + evict_and_wait.await.unwrap(); + + // however a new evict_and_wait will fail + let e = layer.evict_and_wait(FOREVER).await.unwrap_err(); + 
assert!(matches!(e, EvictionError::NotFound), "{e:?}"); + + assert!(!layer.is_likely_resident()); + + drop(download_can_continue); + download.await.expect("download should had succeeded"); + assert!(layer.is_likely_resident()); + + // only now can we evict + layer.evict_and_wait(FOREVER).await.unwrap(); +} + +/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident, +/// which is the last value. +/// +/// Also checks that the same does not happen on a non-evicted layer (regression test). +#[tokio::test(start_paused = true)] +async fn eviction_cancellation_on_drop() { + use crate::repository::Value; + use bytes::Bytes; + + // this is the runtime on which Layer spawns the blocking tasks on + let handle = tokio::runtime::Handle::current(); + + let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap(); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + let (tenant, ctx) = h.load().await; + + let timeline = tenant + .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .await + .unwrap(); + + { + // create_test_timeline wrote us one layer, write another + let mut writer = timeline.writer().await; + writer + .put( + Key::from_i128(5), + Lsn(0x20), + &Value::Image(Bytes::from_static(b"this does not matter either")), + &ctx, + ) + .await + .unwrap(); + + writer.finish_write(Lsn(0x20)); + } + + timeline.freeze_and_flush().await.unwrap(); + + // wait for the upload to complete so our Arc::strong_count assertion holds + timeline.remote_client.wait_completion().await.unwrap(); + + let (evicted_layer, not_evicted) = { + let mut layers = { + let mut guard = timeline.layers.write().await; + let layers = guard.likely_resident_layers().collect::>(); + // remove the layers from layermap + guard.finish_gc_timeline(&layers); + + layers + }; + + assert_eq!(layers.len(), 2); + + (layers.pop().unwrap(), layers.pop().unwrap()) + }; + + let victims = [(evicted_layer, true), (not_evicted, false)]; 
+ + for (victim, evict) in victims { + let resident = victim.keep_resident().await.unwrap(); + drop(victim); + + assert_eq!(Arc::strong_count(&resident.owner.0), 1); + + if evict { + let evict_and_wait = resident.owner.evict_and_wait(FOREVER); + + // drive the future to await on the status channel, and then drop it + tokio::time::timeout(ADVANCE, evict_and_wait) + .await + .expect_err("should had been a timeout since we are holding the layer resident"); + } + + // 1 == we only evict one of the layers + assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get()); + + drop(resident); + + // run any spawned + tokio::time::sleep(ADVANCE).await; + + SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await; + + assert_eq!( + 1, + LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get() + ); + } +} + +/// A test case to remind you of the cost of these structures. You can bump the size limit +/// below if it is really necessary to add more fields to the structures. +#[test] +#[cfg(target_arch = "x86_64")] +fn layer_size() { + assert_eq!(std::mem::size_of::(), 2040); + assert_eq!(std::mem::size_of::(), 104); + assert_eq!(std::mem::size_of::(), 2344); + // it also has the utf8 path +} + +struct SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks: Completion, + blocking_tasks: JoinSet<()>, +} + +impl SpawnBlockingPoolHelper { + /// All `crate::task_mgr::BACKGROUND_RUNTIME` spawn_blocking threads will be consumed until + /// release is called. + /// + /// In the tests this can be used to ensure something cannot be started on the target runtimes + /// spawn_blocking pool. + /// + /// This should be no issue nowadays, because nextest runs each test in its own process.
+ async fn consume_all_spawn_blocking_threads(handle: &tokio::runtime::Handle) -> Self { + let default_max_blocking_threads = 512; + + Self::consume_all_spawn_blocking_threads0(handle, default_max_blocking_threads).await + } + + async fn consume_all_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) -> Self { + assert_ne!(threads, 0); + + let (completion, barrier) = completion::channel(); + let (started, starts_completed) = completion::channel(); + + let mut blocking_tasks = JoinSet::new(); + + for _ in 0..threads { + let barrier = barrier.clone(); + let started = started.clone(); + blocking_tasks.spawn_blocking_on( + move || { + drop(started); + tokio::runtime::Handle::current().block_on(barrier.wait()); + }, + handle, + ); + } + + drop(started); + + starts_completed.wait().await; + + drop(barrier); + + tracing::trace!("consumed all threads"); + + SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks: completion, + blocking_tasks, + } + } + + /// Release all previously blocked spawn_blocking threads + async fn release(self) { + let SpawnBlockingPoolHelper { + awaited_by_spawn_blocking_tasks, + mut blocking_tasks, + } = self; + + drop(awaited_by_spawn_blocking_tasks); + + while let Some(res) = blocking_tasks.join_next().await { + res.expect("none of the tasks should had panicked"); + } + + tracing::trace!("released all threads"); + } + + /// In the tests it is used as an easy way of making sure something scheduled on the target + /// runtimes `spawn_blocking` has completed, because it must've been scheduled and completed + /// before our tasks have a chance to schedule and complete. 
+ async fn consume_and_release_all_of_spawn_blocking_threads(handle: &tokio::runtime::Handle) { + Self::consume_and_release_all_of_spawn_blocking_threads0(handle, 512).await + } + + async fn consume_and_release_all_of_spawn_blocking_threads0( + handle: &tokio::runtime::Handle, + threads: usize, + ) { + Self::consume_all_spawn_blocking_threads0(handle, threads) + .await + .release() + .await + } +} + +#[test] +fn spawn_blocking_pool_helper_actually_works() { + // create a custom runtime for which we know and control how many blocking threads it has + // + // because the amount is not configurable for our helper, expect the same amount as + // BACKGROUND_RUNTIME using the tokio defaults would have. + let rt = tokio::runtime::Builder::new_current_thread() + .max_blocking_threads(1) + .enable_all() + .build() + .unwrap(); + + let handle = rt.handle(); + + rt.block_on(async move { + // this will not return until all threads are spun up and actually executing the code + // waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d. 
+ let consumed = + SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads0(handle, 1).await; + + println!("consumed"); + + let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || { + // this will not get to run before we release + })); + + println!("spawned"); + + tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh) + .await + .expect_err("the task should not have gotten to run yet"); + + println!("tried to join"); + + consumed.release().await; + + println!("released"); + + tokio::time::timeout(std::time::Duration::from_secs(1), jh) + .await + .expect("no timeout") + .expect("no join error"); + + println!("joined"); + }); +} diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index bf24407fc5..a89b66e4a1 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -5,7 +5,7 @@ use utils::{id::TimelineId, lsn::Lsn}; use crate::repository::Key; -use super::{DeltaFileName, ImageFileName, LayerFileName}; +use super::{DeltaLayerName, ImageLayerName, LayerName}; use serde::{Deserialize, Serialize}; @@ -15,7 +15,7 @@ use utils::id::TenantId; /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides /// a unified way to generate layer information like file name. 
-#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Hash)] pub struct PersistentLayerDesc { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, @@ -51,17 +51,17 @@ impl PersistentLayerDesc { } pub fn short_id(&self) -> impl Display { - self.filename() + self.layer_name() } #[cfg(test)] - pub fn new_test(key_range: Range) -> Self { + pub fn new_test(key_range: Range, lsn_range: Range, is_delta: bool) -> Self { Self { tenant_shard_id: TenantShardId::unsharded(TenantId::generate()), timeline_id: TimelineId::generate(), key_range, - lsn_range: Lsn(0)..Lsn(1), - is_delta: false, + lsn_range, + is_delta, file_size: 0, } } @@ -103,14 +103,14 @@ impl PersistentLayerDesc { pub fn from_filename( tenant_shard_id: TenantShardId, timeline_id: TimelineId, - filename: LayerFileName, + filename: LayerName, file_size: u64, ) -> Self { match filename { - LayerFileName::Image(i) => { + LayerName::Image(i) => { Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size) } - LayerFileName::Delta(d) => Self::new_delta( + LayerName::Delta(d) => Self::new_delta( tenant_shard_id, timeline_id, d.key_range, @@ -132,34 +132,34 @@ impl PersistentLayerDesc { lsn..(lsn + 1) } - /// Get a delta file name for this layer. + /// Get a delta layer name for this layer. /// /// Panic: if this is not a delta layer. - pub fn delta_file_name(&self) -> DeltaFileName { + pub fn delta_layer_name(&self) -> DeltaLayerName { assert!(self.is_delta); - DeltaFileName { + DeltaLayerName { key_range: self.key_range.clone(), lsn_range: self.lsn_range.clone(), } } - /// Get a delta file name for this layer. + /// Get an image layer name for this layer.
/// /// Panic: if this is not an image layer, or the lsn range is invalid - pub fn image_file_name(&self) -> ImageFileName { + pub fn image_layer_name(&self) -> ImageLayerName { assert!(!self.is_delta); assert!(self.lsn_range.start + 1 == self.lsn_range.end); - ImageFileName { + ImageLayerName { key_range: self.key_range.clone(), lsn: self.lsn_range.start, } } - pub fn filename(&self) -> LayerFileName { + pub fn layer_name(&self) -> LayerName { if self.is_delta { - self.delta_file_name().into() + self.delta_layer_name().into() } else { - self.image_file_name().into() + self.image_layer_name().into() } } diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/layer_name.rs similarity index 58% rename from pageserver/src/tenant/storage_layer/filename.rs rename to pageserver/src/tenant/storage_layer/layer_name.rs index a98be0842b..da26e1eeb7 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -2,40 +2,42 @@ //! Helper functions for dealing with filenames of the image and delta layer files. //! 
use crate::repository::Key; +use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; use std::ops::Range; use std::str::FromStr; +use regex::Regex; use utils::lsn::Lsn; use super::PersistentLayerDesc; // Note: Timeline::load_layer_map() relies on this sort order #[derive(PartialEq, Eq, Clone, Hash)] -pub struct DeltaFileName { +pub struct DeltaLayerName { pub key_range: Range, pub lsn_range: Range, } -impl std::fmt::Debug for DeltaFileName { +impl std::fmt::Debug for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("DeltaFileName") + f.debug_struct("DeltaLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn_range", &self.lsn_range) .finish() } } -impl PartialOrd for DeltaFileName { +impl PartialOrd for DeltaLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for DeltaFileName { +impl Ord for DeltaLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -55,16 +57,14 @@ impl Ord for DeltaFileName { } } -/// Represents the filename of a DeltaLayer +/// Represents the region of the LSN-Key space covered by a DeltaLayer /// /// ```text /// -__- /// ``` -impl DeltaFileName { - /// - /// Parse a string as a delta file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl DeltaLayerName { + /// Parse the part of a delta layer's file name that represents the LayerName. Returns None + /// if the filename does not match the expected pattern. 
pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -74,10 +74,19 @@ impl DeltaFileName { let key_end_str = key_parts.next()?; let lsn_start_str = lsn_parts.next()?; let lsn_end_str = lsn_parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() { return None; } + if key_start_str.len() != 36 + || key_end_str.len() != 36 + || lsn_start_str.len() != 16 + || lsn_end_str.len() != 16 + { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; @@ -94,14 +103,14 @@ impl DeltaFileName { // or panic? } - Some(DeltaFileName { + Some(DeltaLayerName { key_range: key_start..key_end, lsn_range: start_lsn..end_lsn, }) } } -impl fmt::Display for DeltaFileName { +impl fmt::Display for DeltaLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -115,29 +124,29 @@ impl fmt::Display for DeltaFileName { } #[derive(PartialEq, Eq, Clone, Hash)] -pub struct ImageFileName { +pub struct ImageLayerName { pub key_range: Range, pub lsn: Lsn, } -impl std::fmt::Debug for ImageFileName { +impl std::fmt::Debug for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use super::RangeDisplayDebug; - f.debug_struct("ImageFileName") + f.debug_struct("ImageLayerName") .field("key_range", &RangeDisplayDebug(&self.key_range)) .field("lsn", &self.lsn) .finish() } } -impl PartialOrd for ImageFileName { +impl PartialOrd for ImageLayerName { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for ImageFileName { +impl Ord for ImageLayerName { fn cmp(&self, other: &Self) -> Ordering { let mut cmp = self.key_range.start.cmp(&other.key_range.start); if cmp != Ordering::Equal { @@ -153,7 +162,7 @@ impl Ord for ImageFileName { } } -impl ImageFileName { +impl ImageLayerName { pub fn lsn_as_range(&self) -> Range { // Saves from having to copypaste 
this all over PersistentLayerDesc::image_layer_lsn_range(self.lsn) @@ -161,16 +170,14 @@ impl ImageFileName { } /// -/// Represents the filename of an ImageLayer +/// Represents the part of the Key-LSN space covered by an ImageLayer /// /// ```text /// -__ /// ``` -impl ImageFileName { - /// - /// Parse a string as an image file name. Returns None if the filename does not - /// match the expected pattern. - /// +impl ImageLayerName { + /// Parse a string as the LayerName part of an image layer file name. Returns None if the + /// filename does not match the expected pattern. pub fn parse_str(fname: &str) -> Option { let mut parts = fname.split("__"); let mut key_parts = parts.next()?.split('-'); @@ -182,19 +189,23 @@ impl ImageFileName { return None; } + if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 { + return None; + } + let key_start = Key::from_hex(key_start_str).ok()?; let key_end = Key::from_hex(key_end_str).ok()?; let lsn = Lsn::from_hex(lsn_str).ok()?; - Some(ImageFileName { + Some(ImageLayerName { key_range: key_start..key_end, lsn, }) } } -impl fmt::Display for ImageFileName { +impl fmt::Display for ImageLayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -205,21 +216,24 @@ impl fmt::Display for ImageFileName { ) } } + +/// LayerName is the logical identity of a layer within a LayerMap at a moment in time. The +/// LayerName is not a unique filename, as the same LayerName may have multiple physical incarnations +/// over time (e.g. across shard splits or compression).
The physical filenames of layers in local +/// storage and object names in remote storage consist of the LayerName plus some extra qualifiers +/// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path]) +/// and [`crate::tenant::storage_layer::layer::local_layer_path`]) #[derive(Debug, PartialEq, Eq, Hash, Clone)] -pub enum LayerFileName { - Image(ImageFileName), - Delta(DeltaFileName), +pub enum LayerName { + Image(ImageLayerName), + Delta(DeltaLayerName), } -impl LayerFileName { - pub fn file_name(&self) -> String { - self.to_string() - } - +impl LayerName { /// Determines if this layer file is considered to be in future meaning we will discard these /// layers during timeline initialization from the given disk_consistent_lsn. pub(crate) fn is_in_future(&self, disk_consistent_lsn: Lsn) -> bool { - use LayerFileName::*; + use LayerName::*; match self { Image(file_name) if file_name.lsn > disk_consistent_lsn => true, Delta(file_name) if file_name.lsn_range.end > disk_consistent_lsn + 1 => true, @@ -228,7 +242,7 @@ impl LayerFileName { } pub(crate) fn kind(&self) -> &'static str { - use LayerFileName::*; + use LayerName::*; match self { Delta(_) => "delta", Image(_) => "image", @@ -236,7 +250,7 @@ impl LayerFileName { } } -impl fmt::Display for LayerFileName { +impl fmt::Display for LayerName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Image(fname) => write!(f, "{fname}"), @@ -245,23 +259,36 @@ impl fmt::Display for LayerFileName { } } -impl From for LayerFileName { - fn from(fname: ImageFileName) -> Self { +impl From for LayerName { + fn from(fname: ImageLayerName) -> Self { Self::Image(fname) } } -impl From for LayerFileName { - fn from(fname: DeltaFileName) -> Self { +impl From for LayerName { + fn from(fname: DeltaLayerName) -> Self { Self::Delta(fname) } } -impl FromStr for LayerFileName { +impl FromStr for LayerName { type Err = String; + /// Conversion from 
either a physical layer filename, or the string-ization of + /// Self. When loading a physical layer filename, we drop any extra information + /// not needed to build Self. fn from_str(value: &str) -> Result { - let delta = DeltaFileName::parse_str(value); - let image = ImageFileName::parse_str(value); + let gen_suffix_regex = Regex::new("^(?.+)(?-v1-[0-9a-f]{8})$").unwrap(); + let file_name: Cow = match gen_suffix_regex.captures(value) { + Some(captures) => captures + .name("base") + .expect("Non-optional group") + .as_str() + .into(), + None => value.into(), + }; + + let delta = DeltaLayerName::parse_str(&file_name); + let image = ImageLayerName::parse_str(&file_name); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -276,7 +303,7 @@ impl FromStr for LayerFileName { } } -impl serde::Serialize for LayerFileName { +impl serde::Serialize for LayerName { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -288,19 +315,19 @@ impl serde::Serialize for LayerFileName { } } -impl<'de> serde::Deserialize<'de> for LayerFileName { +impl<'de> serde::Deserialize<'de> for LayerName { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { - deserializer.deserialize_string(LayerFileNameVisitor) + deserializer.deserialize_string(LayerNameVisitor) } } -struct LayerFileNameVisitor; +struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { - type Value = LayerFileName; +impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { + type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { write!( @@ -315,3 +342,38 @@ impl<'de> serde::de::Visitor<'de> for LayerFileNameVisitor { v.parse().map_err(|e| E::custom(e)) } } + +#[cfg(test)] +mod test { + use super::*; + #[test] + fn image_layer_parse() { + let expected = LayerName::Image(ImageLayerName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn: 
Lsn::from_hex("00000000014FED58").unwrap(), + }); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap(); + assert_eq!(parsed, expected,); + + // Omitting generation suffix is valid + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap(); + assert_eq!(parsed, expected,); + } + + #[test] + fn delta_layer_parse() { + let expected = LayerName::Delta(DeltaLayerName { + key_range: Key::from_i128(0) + ..Key::from_hex("000000067F00000001000004DF0000000006").unwrap(), + lsn_range: Lsn::from_hex("00000000014FED58").unwrap() + ..Lsn::from_hex("000000000154C481").unwrap(), + }); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap(); + assert_eq!(parsed, expected); + + // Omitting generation suffix is valid + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap(); + assert_eq!(parsed, expected); + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index aa5894cc37..d679b78f32 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -2,6 +2,7 @@ //! 
such as compaction and GC use std::ops::ControlFlow; +use std::str::FromStr; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -9,14 +10,18 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD; +use crate::tenant::throttle::Stats; +use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; +use rand::Rng; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::{backoff, completion}; +use utils::{backoff, completion, pausable_failpoint}; static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let total_threads = task_mgr::TOKIO_WORKER_THREADS.get(); let permits = usize::max( 1, // while a lot of the work is done on spawn_blocking, we still do @@ -36,12 +41,13 @@ static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy &'static str { - let s: &'static str = self.into(); - s + self.into() } } +static PERMIT_GAUGES: once_cell::sync::Lazy< + enum_map::EnumMap, +> = once_cell::sync::Lazy::new(|| { + enum_map::EnumMap::from_array(std::array::from_fn(|i| { + let kind = ::from_usize(i); + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE.with_label_values(&[kind.into()]) + })) +}); + /// Cancellation safe. 
pub(crate) async fn concurrent_background_tasks_rate_limit_permit( loop_kind: BackgroundLoopKind, _ctx: &RequestContext, -) -> impl Drop { - let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE - .with_label_values(&[loop_kind.as_static_str()]) - .guard(); +) -> tokio::sync::SemaphorePermit<'static> { + let _guard = PERMIT_GAUGES[loop_kind].guard(); + pausable_failpoint!( + "initial-size-calculation-permit-pause", + loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation + ); + + // TODO: assert that we run on BACKGROUND_RUNTIME; requires tokio_unstable Handle::id(); match CONCURRENT_BACKGROUND_TASKS.acquire().await { Ok(permit) => permit, Err(_closed) => unreachable!("we never close the semaphore"), @@ -94,6 +112,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) + // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) @@ -123,6 +142,30 @@ pub fn start_background_loops( } }, ); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::IngestHousekeeping, + Some(tenant_shard_id), + None, + &format!("ingest housekeeping for tenant {tenant_shard_id}"), + false, + { + let tenant = Arc::clone(tenant); + let background_jobs_can_start = background_jobs_can_start.cloned(); + async move { + let cancel = task_mgr::shutdown_token(); + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()) }, + _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} + }; + ingest_housekeeping_loop(tenant, cancel) + .instrument(info_span!("ingest_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) + .await; + Ok(()) + } + }, + ); } /// @@ -133,6 +176,8 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // How many errors we have seen consequtively let mut error_run_count = 0; + let mut last_throttle_flag_reset_at = Instant::now(); + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); @@ -176,8 +221,11 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { ); error_run_count += 1; let wait_duration = Duration::from_secs_f64(wait_duration); - error!( - "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", + log_compaction_error( + &e, + error_run_count, + &wait_duration, + cancel.is_cancelled(), ); wait_duration } else { @@ -186,11 +234,38 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction); + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. 
- tenant.walredo_mgr.maybe_quiesce(period * 10); + if let Some(walredo_mgr) = &tenant.walredo_mgr { + walredo_mgr.maybe_quiesce(period * 10); + } + + // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off, + // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens. + info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + let now = Instant::now(); + let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); + let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats(); + if count_throttled == 0 { + return; + } + let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let delta = now - prev; + info!( + n_seconds=%format_args!("{:.3}", + delta.as_secs_f64()), + count_accounted, + count_throttled, + sum_throttled_usecs, + allowed_rps=%format_args!("{allowed_rps:.0}"), + "shard was throttled in the last n_seconds") + }); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -205,6 +280,58 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } +fn log_compaction_error( + e: &CompactionError, + error_run_count: u32, + sleep_duration: &std::time::Duration, + task_cancelled: bool, +) { + use crate::tenant::upload_queue::NotInitialized; + use crate::tenant::PageReconstructError; + use CompactionError::*; + + enum LooksLike { + Info, + Error, + } + + let decision = match e { + ShuttingDown => None, + _ if task_cancelled => Some(LooksLike::Info), + Other(e) => { + let root_cause = e.root_cause(); + + let is_stopping = { + let upload_queue = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + + let timeline = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + + upload_queue || timeline + }; + + if 
is_stopping { + Some(LooksLike::Info) + } else { + Some(LooksLike::Error) + } + } + }; + + match decision { + Some(LooksLike::Info) => info!( + "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}", + ), + Some(LooksLike::Error) => error!( + "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}", + ), + None => {} + } +} + /// /// GC task's main loop /// @@ -219,6 +346,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { // cutoff specified as time. let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; loop { tokio::select! { @@ -235,6 +363,14 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { if first { first = false; + + if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel) + .await + .is_err() + { + break; + } + if random_init_delay(period, &cancel).await.is_err() { break; } @@ -253,21 +389,28 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { let res = tenant .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx) .await; - if let Err(e) = res { - let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count + 1, - 1.0, - MAX_BACKOFF_SECS, - ); - error_run_count += 1; - let wait_duration = Duration::from_secs_f64(wait_duration); - error!( + match res { + Ok(_) => { + error_run_count = 0; + period + } + Err(crate::tenant::GcError::TenantCancelled) => { + return; + } + Err(e) => { + let wait_duration = backoff::exponential_backoff_duration_seconds( + error_run_count + 1, + 1.0, + MAX_BACKOFF_SECS, + ); + error_run_count += 1; + let wait_duration = Duration::from_secs_f64(wait_duration); + + error!( "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", ); - wait_duration - } else { - error_run_count = 0; - period + wait_duration + } } }; @@ -286,6 +429,61 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { 
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } +async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken) { + TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); + async { + loop { + tokio::select! { + _ = cancel.cancelled() => { + return; + }, + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { + ControlFlow::Break(()) => return, + ControlFlow::Continue(()) => (), + }, + } + + // We run ingest housekeeping with the same frequency as compaction: it is not worth + // having a distinct setting. But we don't run it in the same task, because compaction + // blocks on acquiring the background job semaphore. + let period = tenant.get_compaction_period(); + + // If compaction period is set to zero (to disable it), then we will use a reasonable default + let period = if period == Duration::ZERO { + humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) + .unwrap() + .into() + } else { + period + }; + + // Jitter the period by +/- 5% + let period = + rand::thread_rng().gen_range((period * (95)) / 100..(period * (105)) / 100); + + // Always sleep first: we do not need to do ingest housekeeping early in the lifetime of + // a tenant, since it won't have started writing any ephemeral files yet. 
+ if tokio::time::timeout(period, cancel.cancelled()) + .await + .is_ok() + { + break; + } + + let started_at = Instant::now(); + tenant.ingest_housekeeping().await; + + warn_when_period_overrun( + started_at.elapsed(), + period, + BackgroundLoopKind::IngestHouseKeeping, + ); + } + } + .await; + TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); +} + async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { // if the tenant has a proper status already, no need to wait for anything if tenant.current_state() == TenantState::Active { @@ -327,8 +525,6 @@ pub(crate) async fn random_init_delay( period: Duration, cancel: &CancellationToken, ) -> Result<(), Cancelled> { - use rand::Rng; - if period == Duration::ZERO { return Ok(()); } @@ -344,6 +540,21 @@ pub(crate) async fn random_init_delay( } } +/// Delays GC by default lease length at restart. +/// +/// We do this as the lease mappings are not persisted to disk. By delaying GC by default +/// length, we guarantee that all the leases we granted before the restart will expire +/// when we run GC for the first time after the restart. +pub(crate) async fn delay_by_lease_length( + length: Duration, + cancel: &CancellationToken, +) -> Result<(), Cancelled> { + match tokio::time::timeout(length, cancel.cancelled()).await { + Ok(_) => Err(Cancelled), + Err(_) => Ok(()), + } +} + /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
pub(crate) fn warn_when_period_overrun( elapsed: Duration, diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs new file mode 100644 index 0000000000..f3f3d5e3ae --- /dev/null +++ b/pageserver/src/tenant/throttle.rs @@ -0,0 +1,178 @@ +use std::{ + str::FromStr, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::{Duration, Instant}, +}; + +use arc_swap::ArcSwap; +use enumset::EnumSet; +use tracing::{error, warn}; + +use crate::{context::RequestContext, task_mgr::TaskKind}; + +/// Throttle for `async` functions. +/// +/// Runtime reconfigurable. +/// +/// To share a throttle among multiple entities, wrap it in an [`Arc`]. +/// +/// The initial use case for this is tenant-wide throttling of getpage@lsn requests. +pub struct Throttle { + inner: ArcSwap, + metric: M, + /// will be turned into [`Stats::count_accounted`] + count_accounted: AtomicU64, + /// will be turned into [`Stats::count_throttled`] + count_throttled: AtomicU64, + /// will be turned into [`Stats::sum_throttled_usecs`] + sum_throttled_usecs: AtomicU64, +} + +pub struct Inner { + task_kinds: EnumSet, + rate_limiter: Arc, + config: Config, +} + +pub type Config = pageserver_api::models::ThrottleConfig; + +pub struct Observation { + pub wait_time: Duration, +} +pub trait Metric { + fn observe_throttling(&self, observation: &Observation); +} + +/// See [`Throttle::reset_stats`]. +pub struct Stats { + // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`]. + pub count_accounted: u64, + // Subset of the `accounted` requests that were actually throttled. + // Note that the numbers are stored as two independent atomics, so, there might be a slight drift. + pub count_throttled: u64, + // Sum of microseconds that throttled requests spent waiting for throttling.
+ pub sum_throttled_usecs: u64, +} + +impl Throttle +where + M: Metric, +{ + pub fn new(config: Config, metric: M) -> Self { + Self { + inner: ArcSwap::new(Arc::new(Self::new_inner(config))), + metric, + count_accounted: AtomicU64::new(0), + count_throttled: AtomicU64::new(0), + sum_throttled_usecs: AtomicU64::new(0), + } + } + fn new_inner(config: Config) -> Inner { + let Config { + task_kinds, + initial, + refill_interval, + refill_amount, + max, + fair, + } = &config; + let task_kinds: EnumSet = task_kinds + .iter() + .filter_map(|s| match TaskKind::from_str(s) { + Ok(v) => Some(v), + Err(e) => { + // TODO: avoid this failure mode + error!( + "cannot parse task kind, ignoring for rate limiting {}", + utils::error::report_compact_sources(&e) + ); + None + } + }) + .collect(); + Inner { + task_kinds, + rate_limiter: Arc::new( + leaky_bucket::RateLimiter::builder() + .initial(*initial) + .interval(*refill_interval) + .refill(refill_amount.get()) + .max(*max) + .fair(*fair) + .build(), + ), + config, + } + } + pub fn reconfigure(&self, config: Config) { + self.inner.store(Arc::new(Self::new_inner(config))); + } + + /// The [`Throttle`] keeps internal counters of accounted requests, throttled requests, and microseconds spent throttled. + /// This method retrieves those counters and resets them to zero. + /// Useful for periodic reporting. + pub fn reset_stats(&self) -> Stats { + let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed); + let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed); + let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed); + Stats { + count_accounted, + count_throttled, + sum_throttled_usecs, + } + } + + /// See [`Config::steady_rps`]. 
+ pub fn steady_rps(&self) -> f64 { + self.inner.load().config.steady_rps() + } + + pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { + let inner = self.inner.load_full(); // clones the `Inner` Arc + if !inner.task_kinds.contains(ctx.task_kind()) { + return None; + }; + let start = std::time::Instant::now(); + let mut did_throttle = false; + let acquire = inner.rate_limiter.acquire(key_count); + // turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate + let acquire = tokio::task::unconstrained(acquire); + let mut acquire = std::pin::pin!(acquire); + std::future::poll_fn(|cx| { + use std::future::Future; + let poll = acquire.as_mut().poll(cx); + did_throttle = did_throttle || poll.is_pending(); + poll + }) + .await; + self.count_accounted.fetch_add(1, Ordering::Relaxed); + if did_throttle { + self.count_throttled.fetch_add(1, Ordering::Relaxed); + let now = Instant::now(); + let wait_time = now - start; + self.sum_throttled_usecs + .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); + let observation = Observation { wait_time }; + self.metric.observe_throttling(&observation); + match ctx.micros_spent_throttled.add(wait_time) { + Ok(res) => res, + Err(error) => { + use once_cell::sync::Lazy; + use utils::rate_limit::RateLimit; + static WARN_RATE_LIMIT: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut guard = WARN_RATE_LIMIT.lock().unwrap(); + guard.call(move || { + warn!(error, "error adding time spent throttled; this message is logged at a global rate limit"); + }); + } + } + Some(wait_time) + } else { + None + } + } +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 24a92859b7..a4f1108635 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,4 +1,7 @@ +pub(crate) mod analysis; +mod compaction; pub mod delete; +pub(crate) mod detach_ancestor; mod eviction_task; mod init; pub mod 
layer_manager; @@ -8,17 +11,25 @@ pub mod uninit; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; +use arc_swap::ArcSwap; use bytes::Bytes; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use enumset::EnumSet; use fail::fail_point; -use itertools::Itertools; +use once_cell::sync::Lazy; use pageserver_api::{ - models::{ - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo, - TimelineState, + key::{ + AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, + NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, }, - shard::{ShardIdentity, TenantShardId}, + keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, + models::{ + AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings, + DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, + InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, + }, + reltag::BlockNumber, + shard::{ShardIdentity, ShardNumber, TenantShardId}, }; use rand::Rng; use serde_with::serde_as; @@ -29,46 +40,72 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::sync::gate::Gate; +use utils::{ + bin_ser::BeSer, + fs_ext, pausable_failpoint, + sync::gate::{Gate, GateGuard}, + vec_map::VecMap, +}; -use std::collections::{BinaryHeap, HashMap, HashSet}; -use std::ops::{Deref, Range}; use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use std::{ + array, + collections::{BTreeMap, HashMap, HashSet}, + sync::atomic::AtomicU64, +}; use std::{ cmp::{max, min, Ordering}, ops::ControlFlow, }; +use std::{ + collections::btree_map::Entry, + ops::{Deref, Range}, +}; -use crate::context::{ - AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder, +use crate::metrics::GetKind; +use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; +use 
crate::{ + aux_file::AuxFileSizeEstimator, + tenant::{ + layer_map::{LayerMap, SearchResult}, + metadata::TimelineMetadata, + }, }; -use crate::tenant::storage_layer::delta_layer::DeltaEntry; -use crate::tenant::storage_layer::{ - AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, - LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult, - ValueReconstructState, +use crate::{ + context::{DownloadBehavior, RequestContext}, + disk_usage_eviction_task::DiskUsageEvictionInfo, + pgdatadir_mapping::CollectKeySpaceError, }; -use crate::tenant::tasks::BackgroundLoopKind; -use crate::tenant::timeline::logical_size::CurrentLogicalSize; -use crate::tenant::{ - layer_map::{LayerMap, SearchResult}, - metadata::{save_metadata, TimelineMetadata}, - par_fsync, +use crate::{ + disk_usage_eviction_task::finite_f32, + tenant::storage_layer::{ + AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer, + LayerAccessStatsReset, LayerName, ResidentLayer, ValueReconstructResult, + ValueReconstructState, ValuesReconstructState, + }, +}; +use crate::{ + disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, +}; +use crate::{ + metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, +}; +use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + virtual_file::{MaybeFatalIo, VirtualFile}, }; -use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError}; use crate::config::PageServerConf; -use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum}; +use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{ TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; -use crate::pgdatadir_mapping::LsnForTimestamp; -use crate::pgdatadir_mapping::{is_inherited_key, 
is_rel_fsm_block_key, is_rel_vm_block_key}; -use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; -use crate::tenant::config::{EvictionPolicy, TenantConfOpt}; +use crate::pgdatadir_mapping::CalculateLogicalSizeError; +use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; @@ -97,14 +134,17 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::config::TenantConf; -use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart}; -use super::remote_timeline_client::RemoteTimelineClient; -use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; +use super::{config::TenantConf, storage_layer::VectoredValueReconstructState}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; +use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; +use super::{ + secondary::heatmap::{HeatMapLayer, HeatMapTimeline}, + GcError, +}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub(super) enum FlushLoopState { +pub(crate) enum FlushLoopState { NotStarted, Running { #[cfg(test)] @@ -115,9 +155,28 @@ pub(super) enum FlushLoopState { Exited, } +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum ImageLayerCreationMode { + /// Try to create image layers based on `time_for_new_image_layer`. Used in compaction code path. + Try, + /// Force creating the image layers if possible. For now, no image layers will be created + /// for metadata keys. Used in compaction code path with force flag enabled. + Force, + /// Initial ingestion of the data, and no data should be dropped in this function. This + /// means that no metadata keys should be included in the partitions. Used in flush frozen layer + /// code path. 
+ Initial, +} + +impl std::fmt::Display for ImageLayerCreationMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(Debug, Clone, PartialEq, Eq)] -pub struct Hole { +pub(crate) struct Hole { key_range: Range, coverage_size: usize, } @@ -148,13 +207,30 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { - pub remote_client: Option, - pub deletion_queue_client: DeletionQueueClient, + pub remote_client: RemoteTimelineClient, + pub timeline_get_throttle: Arc< + crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, + >, +} + +pub(crate) struct AuxFilesState { + pub(crate) dir: Option, + pub(crate) n_deltas: usize, +} + +/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL +/// ingestion considerably, because WAL ingestion needs to check on most records if the record +/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end +/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the +/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`]. +pub(crate) struct RelSizeCache { + pub(crate) complete_as_of: Lsn, + pub(crate) map: HashMap, } pub struct Timeline { conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, myself: Weak, @@ -193,27 +269,18 @@ pub struct Timeline { /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`. pub(crate) layers: Arc>, - /// Set of key ranges which should be covered by image layers to - /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. 
- /// It is used by compaction task when it checks if new image layer should be created. - /// Newly created image layer doesn't help to remove the delta layer, until the - /// newly created image layer falls off the PITR horizon. So on next GC cycle, - /// gc_timeline may still want the new image layer to be created. To avoid redundant - /// image layers creation we should check if image layer exists but beyond PITR horizon. - /// This is why we need remember GC cutoff LSN. - /// - wanted_image_layers: Mutex>, - last_freeze_at: AtomicLsn, // Atomic would be more appropriate here. last_freeze_ts: RwLock, - // WAL redo manager - walredo_mgr: Arc, + pub(crate) standby_horizon: AtomicLsn, + + // WAL redo manager. `None` only for broken tenants. + walredo_mgr: Option>, /// Remote storage client. /// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details. - pub remote_client: Option>, + pub remote_client: Arc, // What page versions do we hold in the repository? If we get a // request > last_record_lsn, we need to wait until we receive all @@ -246,21 +313,31 @@ pub struct Timeline { pub(super) metrics: TimelineMetrics, + // `Timeline` doesn't write these metrics itself, but it manages the lifetime. Code + // in `crate::page_service` writes these metrics. + pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline, + + directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM], + /// Ensures layers aren't frozen by checkpointer between /// [`Timeline::get_layer_for_write`] and layer reads. /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. - write_lock: tokio::sync::Mutex<()>, + /// + /// The state is cleared upon freezing. + write_lock: tokio::sync::Mutex>, /// Used to avoid multiple `flush_loop` tasks running pub(super) flush_loop_state: Mutex, /// layer_flush_start_tx can be used to wake up the layer-flushing task. 
- /// The value is a counter, incremented every time a new flush cycle is requested. - /// The flush cycle counter is sent back on the layer_flush_done channel when - /// the flush finishes. You can use that to wait for the flush to finish. - layer_flush_start_tx: tokio::sync::watch::Sender, + /// - The u64 value is a counter, incremented every time a new flush cycle is requested. + /// The flush cycle counter is sent back on the layer_flush_done channel when + /// the flush finishes. You can use that to wait for the flush to finish. + /// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn + /// read by whoever sends an update + layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>, /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, @@ -269,7 +346,7 @@ pub struct Timeline { // List of child timelines and their branch points. This is needed to avoid // garbage collecting data that is still needed by the child timelines. - pub gc_info: std::sync::RwLock, + pub(crate) gc_info: std::sync::RwLock, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -279,12 +356,14 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? - partitioning: Mutex<(KeyPartitioning, Lsn)>, + /// When did we last calculate the partitioning? Make it pub to test cases. + pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, + last_image_layer_creation_check_at: AtomicLsn, + /// Current logical size of the "datadir", at the last LSN. 
current_logical_size: LogicalSize, @@ -295,7 +374,7 @@ pub struct Timeline { pub walreceiver: Mutex>, /// Relation size cache - pub rel_size_cache: RwLock>, + pub(crate) rel_size_cache: RwLock, download_all_remote_layers_task_info: RwLock>, @@ -322,7 +401,7 @@ pub struct Timeline { /// /// Must only be taken in two places: /// - [`Timeline::compact`] (this file) - /// - [`delete::delete_local_layer_files`] + /// - [`delete::delete_local_timeline_directory`] /// /// Timeline deletion will acquire both compaction and gc locks in whatever order. compaction_lock: tokio::sync::Mutex<()>, @@ -331,10 +410,32 @@ pub struct Timeline { /// /// Must only be taken in two places: /// - [`Timeline::gc`] (this file) - /// - [`delete::delete_local_layer_files`] + /// - [`delete::delete_local_timeline_directory`] /// /// Timeline deletion will acquire both compaction and gc locks in whatever order. gc_lock: tokio::sync::Mutex<()>, + + /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction. + timeline_get_throttle: Arc< + crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, + >, + + /// Keep aux directory cache to avoid its reconstruction on each update + pub(crate) aux_files: tokio::sync::Mutex, + + /// Size estimator for aux file v2 + pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, + + /// Indicate whether aux file v2 storage is enabled. + pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, + + /// Some test cases directly place keys into the timeline without actually modifying the directory + /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that + /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense + /// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and + /// in the future, add `extra_test_sparse_keyspace` if necessary. 
+ #[cfg(test)] + pub(crate) extra_test_dense_keyspace: ArcSwap, } pub struct WalReceiverInfo { @@ -343,33 +444,67 @@ pub struct WalReceiverInfo { pub last_received_msg_ts: u128, } -/// /// Information about how much history needs to be retained, needed by /// Garbage Collection. -/// -pub struct GcInfo { +#[derive(Default)] +pub(crate) struct GcInfo { /// Specific LSNs that are needed. /// /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub retain_lsns: Vec, + pub(crate) retain_lsns: Vec, - /// In addition to 'retain_lsns', keep everything newer than this - /// point. + /// The cutoff coordinates, which are combined by selecting the minimum. + pub(crate) cutoffs: GcCutoffs, + + /// Leases granted to particular LSNs. + pub(crate) leases: BTreeMap, +} + +impl GcInfo { + pub(crate) fn min_cutoff(&self) -> Lsn { + self.cutoffs.select_min() + } +} + +/// The `GcInfo` component describing which Lsns need to be retained. +#[derive(Debug)] +pub(crate) struct GcCutoffs { + /// Keep everything newer than this point. /// /// This is calculated by subtracting 'gc_horizon' setting from /// last-record LSN /// /// FIXME: is this inclusive or exclusive? - pub horizon_cutoff: Lsn, + pub(crate) horizon: Lsn, /// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this /// point. /// /// This is calculated by finding a number such that a record is needed for PITR /// if only if its LSN is larger than 'pitr_cutoff'. 
- pub pitr_cutoff: Lsn, + pub(crate) pitr: Lsn, +} + +impl Default for GcCutoffs { + fn default() -> Self { + Self { + horizon: Lsn::INVALID, + pitr: Lsn::INVALID, + } + } +} + +impl GcCutoffs { + fn select_min(&self) -> Lsn { + std::cmp::min(self.horizon, self.pitr) + } +} + +pub(crate) struct TimelineVisitOutcome { + completed_keyspace: KeySpace, + image_covered_keyspace: KeySpace, } /// An error happened in a get() operation. @@ -379,34 +514,164 @@ pub(crate) enum PageReconstructError { Other(#[from] anyhow::Error), #[error("Ancestor LSN wait error: {0}")] - AncestorLsnTimeout(#[from] WaitLsnError), + AncestorLsnTimeout(WaitLsnError), - /// The operation was cancelled - #[error("Cancelled")] + #[error("timeline shutting down")] Cancelled, - /// The ancestor of this is being stopped - #[error("ancestor timeline {0} is being stopped")] - AncestorStopping(TimelineId), - /// An error happened replaying WAL records #[error(transparent)] WalRedo(anyhow::Error), + + #[error("{0}")] + MissingKey(MissingKeyError), +} + +impl GetVectoredError { + #[cfg(test)] + pub(crate) fn is_missing_key_error(&self) -> bool { + matches!(self, Self::MissingKey(_)) + } +} + +#[derive(Debug)] +pub struct MissingKeyError { + key: Key, + shard: ShardNumber, + cont_lsn: Lsn, + request_lsn: Lsn, + ancestor_lsn: Option, + traversal_path: Vec, + backtrace: Option, +} + +impl std::fmt::Display for MissingKeyError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", + self.key, self.shard, self.cont_lsn, self.request_lsn + )?; + if let Some(ref ancestor_lsn) = self.ancestor_lsn { + write!(f, ", ancestor {}", ancestor_lsn)?; + } + + if !self.traversal_path.is_empty() { + writeln!(f)?; + } + + for (r, c, l) in &self.traversal_path { + writeln!( + f, + "layer traversal: result {:?}, cont_lsn {}, layer: {}", + r, c, l, + )?; + } + + if let Some(ref backtrace) = self.backtrace { + write!(f, 
"\n{}", backtrace)?; + } + + Ok(()) + } +} + +impl PageReconstructError { + /// Returns true if this error indicates a tenant/timeline shutdown alike situation + pub(crate) fn is_stopping(&self) -> bool { + use PageReconstructError::*; + match self { + Other(_) => false, + AncestorLsnTimeout(_) => false, + Cancelled => true, + WalRedo(_) => false, + MissingKey { .. } => false, + } + } } #[derive(thiserror::Error, Debug)] -enum FlushLayerError { - /// Timeline cancellation token was cancelled +pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] Cancelled, #[error(transparent)] - PageReconstructError(#[from] PageReconstructError), + GetVectoredError(GetVectoredError), + + #[error(transparent)] + PageReconstructError(PageReconstructError), #[error(transparent)] Other(#[from] anyhow::Error), } +#[derive(thiserror::Error, Debug, Clone)] +pub(crate) enum FlushLayerError { + /// Timeline cancellation token was cancelled + #[error("timeline shutting down")] + Cancelled, + + /// We tried to flush a layer while the Timeline is in an unexpected state + #[error("cannot flush frozen layers when flush_loop is not running, state is {0:?}")] + NotRunning(FlushLoopState), + + // Arc<> the following non-clonable error types: we must be Clone-able because the flush error is propagated from the flush + // loop via a watch channel, where we can only borrow it. + #[error(transparent)] + CreateImageLayersError(Arc), + + #[error(transparent)] + Other(#[from] Arc), +} + +impl FlushLayerError { + // When crossing from generic anyhow errors to this error type, we explicitly check + // for timeline cancellation to avoid logging inoffensive shutdown errors as warn/err. 
+ fn from_anyhow(timeline: &Timeline, err: anyhow::Error) -> Self { + if timeline.cancel.is_cancelled() { + Self::Cancelled + } else { + Self::Other(Arc::new(err)) + } + } +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum GetVectoredError { + #[error("timeline shutting down")] + Cancelled, + + #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)] + Oversized(u64), + + #[error("Requested at invalid LSN: {0}")] + InvalidLsn(Lsn), + + #[error("Requested key not found: {0}")] + MissingKey(MissingKeyError), + + #[error(transparent)] + GetReadyAncestorError(GetReadyAncestorError), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum GetReadyAncestorError { + #[error("Ancestor LSN wait error: {0}")] + AncestorLsnTimeout(#[from] WaitLsnError), + + #[error("Bad state on timeline {timeline_id}: {state:?}")] + BadState { + timeline_id: TimelineId, + state: TimelineState, + }, + + #[error("Cancelled")] + Cancelled, +} + #[derive(Clone, Copy)] pub enum LogicalSizeCalculationCause { Initial, @@ -423,6 +688,7 @@ pub enum GetLogicalSizePriority { #[derive(enumset::EnumSetType)] pub(crate) enum CompactFlags { ForceRepartition, + ForceImageLayerCreation, } impl std::fmt::Debug for Timeline { @@ -438,30 +704,151 @@ pub(crate) enum WaitLsnError { Shutdown, // Called on an timeline not in active state or shutting down - #[error("Bad state (not active)")] - BadState, + #[error("Bad timeline state: {0:?}")] + BadState(TimelineState), // Timeout expired while waiting for LSN to catch up with goal. #[error("{0}")] Timeout(String), } +// The impls below achieve cancellation mapping for errors. +// Perhaps there's a way of achieving this with less cruft. 
+ +impl From for CompactionError { + fn from(e: CreateImageLayersError) -> Self { + match e { + CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + _ => CompactionError::Other(e.into()), + } + } +} + +impl From for FlushLayerError { + fn from(e: CreateImageLayersError) -> Self { + match e { + CreateImageLayersError::Cancelled => FlushLayerError::Cancelled, + any => FlushLayerError::CreateImageLayersError(Arc::new(any)), + } + } +} + +impl From for CreateImageLayersError { + fn from(e: PageReconstructError) -> Self { + match e { + PageReconstructError::Cancelled => CreateImageLayersError::Cancelled, + _ => CreateImageLayersError::PageReconstructError(e), + } + } +} + +impl From for CreateImageLayersError { + fn from(e: GetVectoredError) -> Self { + match e { + GetVectoredError::Cancelled => CreateImageLayersError::Cancelled, + _ => CreateImageLayersError::GetVectoredError(e), + } + } +} + +impl From for PageReconstructError { + fn from(e: GetVectoredError) -> Self { + match e { + GetVectoredError::Cancelled => PageReconstructError::Cancelled, + GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")), + err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()), + GetVectoredError::MissingKey(err) => PageReconstructError::MissingKey(err), + GetVectoredError::GetReadyAncestorError(err) => PageReconstructError::from(err), + GetVectoredError::Other(err) => PageReconstructError::Other(err), + } + } +} + +impl From for PageReconstructError { + fn from(e: GetReadyAncestorError) -> Self { + use GetReadyAncestorError::*; + match e { + AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err), + bad_state @ BadState { .. 
} => PageReconstructError::Other(anyhow::anyhow!(bad_state)), + Cancelled => PageReconstructError::Cancelled, + } + } +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetVectoredImpl { + Sequential, + Vectored, +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + +pub(crate) enum WaitLsnWaiter<'a> { + Timeline(&'a Timeline), + Tenant, + PageService, +} + +/// Argument to [`Timeline::shutdown`]. +#[derive(Debug, Clone, Copy)] +pub(crate) enum ShutdownMode { + /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then + /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// + /// While we are flushing, we continue to accept read I/O for LSNs ingested before + /// the call to [`Timeline::shutdown`]. + FreezeAndFlush, + /// Shut down immediately, without waiting for any open layers to flush. 
+ Hard, +} + +struct ImageLayerCreationOutcome { + image: Option, + next_start_key: Key, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created - pub fn get_ancestor_lsn(&self) -> Lsn { + pub(crate) fn get_ancestor_lsn(&self) -> Lsn { self.ancestor_lsn } /// Get the ancestor's timeline id - pub fn get_ancestor_timeline_id(&self) -> Option { + pub(crate) fn get_ancestor_timeline_id(&self) -> Option { self.ancestor_timeline .as_ref() .map(|ancestor| ancestor.timeline_id) } /// Lock and get timeline's GC cutoff - pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { + pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() } @@ -470,6 +857,8 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// + /// This method enforces [`Self::timeline_get_throttle`] internally. + /// /// NOTE: It is considered an error to 'get' a key that doesn't exist. The /// abstraction above this needs to store suitable metadata to track what /// data exists with what keys, in separate metadata entries. If a @@ -480,6 +869,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. + #[inline(always)] pub(crate) async fn get( &self, key: Key, @@ -495,13 +885,7 @@ impl Timeline { // page_service. debug_assert!(!self.shard_identity.is_key_disposable(&key)); - // XXX: structured stats collection for layer eviction here. - trace!( - "get page request for {}@{} from task kind {:?}", - key, - lsn, - ctx.task_kind() - ); + self.timeline_get_throttle.throttle(ctx, 1).await; // Check the page cache. We will get back the most recent page with lsn <= `lsn`. 
// The cached image can be returned directly if there is no WAL between the cached image @@ -524,12 +908,88 @@ impl Timeline { None => None, }; - let mut reconstruct_state = ValueReconstructState { - records: Vec::new(), - img: cached_page_img, - }; + match self.conf.get_impl { + GetImpl::Legacy => { + let reconstruct_state = ValueReconstructState { + records: Vec::new(), + img: cached_page_img, + }; - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); + self.get_impl(key, lsn, reconstruct_state, ctx).await + } + GetImpl::Vectored => { + let keyspace = KeySpace { + ranges: vec![key..key.next()], + }; + + // Initialise the reconstruct state for the key with the cache + // entry returned above. + let mut reconstruct_state = ValuesReconstructState::new(); + + // Only add the cached image to the reconstruct state when it exists. + if cached_page_img.is_some() { + let mut key_state = VectoredValueReconstructState::default(); + key_state.img = cached_page_img; + reconstruct_state.keys.insert(key, Ok(key_state)); + } + + let vectored_res = self + .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .await; + + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + } + + let key_value = vectored_res?.pop_first(); + match key_value { + Some((got_key, value)) => { + if got_key != key { + error!( + "Expected {}, but singular vectored get returned {}", + key, got_key + ); + Err(PageReconstructError::Other(anyhow!( + "Singular vectored get returned wrong key" + ))) + } else { + value + } + } + None => Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(0), + request_lsn: lsn, + ancestor_lsn: None, + traversal_path: Vec::new(), + backtrace: None, + })), + } + } + } + } + + /// Not subject to [`Self::timeline_get_throttle`]. 
+ async fn get_impl( + &self, + key: Key, + lsn: Lsn, + mut reconstruct_state: ValueReconstructState, + ctx: &RequestContext, + ) -> Result { + // XXX: structured stats collection for layer eviction here. + trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); + + let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(GetKind::Singular) + .start_timer(); let path = self .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; @@ -539,7 +999,7 @@ impl Timeline { let res = self.reconstruct_value(key, lsn, reconstruct_state).await; let elapsed = start.elapsed(); crate::metrics::RECONSTRUCT_TIME - .for_result(&res) + .for_get_kind(GetKind::Singular) .observe(elapsed.as_secs_f64()); if cfg!(feature = "testing") && res.is_err() { @@ -552,7 +1012,7 @@ impl Timeline { writeln!( msg, "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}", - layer(), + layer, ) .expect("string grows") }); @@ -565,63 +1025,446 @@ impl Timeline { res } + pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; + + /// Look up multiple page versions at a given LSN + /// + /// This naive implementation will be replaced with a more efficient one + /// which actually vectorizes the read path. 
+ pub(crate) async fn get_vectored( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(lsn)); + } + + let key_count = keyspace.total_raw_size().try_into().unwrap(); + if key_count > Timeline::MAX_GET_VECTORED_KEYS { + return Err(GetVectoredError::Oversized(key_count)); + } + + for range in &keyspace.ranges { + let mut key = range.start; + while key != range.end { + assert!(!self.shard_identity.is_key_disposable(&key)); + key = key.next(); + } + } + + trace!( + "get vectored request for {:?}@{} from task kind {:?} will use {} implementation", + keyspace, + lsn, + ctx.task_kind(), + self.conf.get_vectored_impl + ); + + let start = crate::metrics::GET_VECTORED_LATENCY + .for_task_kind(ctx.task_kind()) + .map(|metric| (metric, Instant::now())); + + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + .throttle(ctx, key_count as usize) + .await; + + let res = match self.conf.get_vectored_impl { + GetVectoredImpl::Sequential => { + self.get_vectored_sequential_impl(keyspace, lsn, ctx).await + } + GetVectoredImpl::Vectored => { + let vectored_res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::new(), + ctx, + ) + .await; + + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + } + + vectored_res + } + }; + + if let Some((metric, start)) = start { + let elapsed = start.elapsed(); + let ex_throttled = if let Some(throttled) = throttled { + elapsed.checked_sub(throttled) + } else { + Some(elapsed) + }; + + if let Some(ex_throttled) = ex_throttled { + metric.observe(ex_throttled.as_secs_f64()); + } else { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); + let mut rate_limit = 
LOGGED.lock().unwrap(); + rate_limit.call(|| { + warn!("error deducting time spent throttled; this message is logged at a global rate limit"); + }); + } + } + + res + } + + /// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored + /// get underlying. Normal vectored get would throw an error when a key in the keyspace is not found + /// during the search, but for the scan interface, it returns all existing key-value pairs, and does + /// not expect each single key in the key space will be found. The semantics is closer to the RocksDB + /// scan iterator interface. We could optimize this interface later to avoid some checks in the vectored + /// get path to maintain and split the probing and to-be-probe keyspace. We also need to ensure that + /// the scan operation will not cause OOM in the future. + #[allow(dead_code)] + pub(crate) async fn scan( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(lsn)); + } + + trace!( + "key-value scan request for {:?}@{} from task kind {:?}", + keyspace, + lsn, + ctx.task_kind() + ); + + // We should generalize this into Keyspace::contains in the future. 
+ for range in &keyspace.ranges { + if range.start.field1 < METADATA_KEY_BEGIN_PREFIX + || range.end.field1 > METADATA_KEY_END_PREFIX + { + return Err(GetVectoredError::Other(anyhow::anyhow!( + "only metadata keyspace can be scanned" + ))); + } + } + + let start = crate::metrics::SCAN_LATENCY + .for_task_kind(ctx.task_kind()) + .map(ScanLatencyOngoingRecording::start_recording); + + // start counting after throttle so that throttle time + // is always less than observation time + let throttled = self + .timeline_get_throttle + // assume scan = 1 quota for now until we find a better way to process this + .throttle(ctx, 1) + .await; + + let vectored_res = self + .get_vectored_impl( + keyspace.clone(), + lsn, + &mut ValuesReconstructState::default(), + ctx, + ) + .await; + + if let Some(recording) = start { + recording.observe(throttled); + } + + vectored_res + } + + /// Not subject to [`Self::timeline_get_throttle`]. + pub(super) async fn get_vectored_sequential_impl( + &self, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + let mut values = BTreeMap::new(); + + for range in keyspace.ranges { + let mut key = range.start; + while key != range.end { + let block = self + .get_impl(key, lsn, ValueReconstructState::default(), ctx) + .await; + + use PageReconstructError::*; + match block { + Err(Cancelled) => return Err(GetVectoredError::Cancelled), + Err(MissingKey(_)) + if NON_INHERITED_RANGE.contains(&key) + || NON_INHERITED_SPARSE_RANGE.contains(&key) => + { + // Ignore missing key error for aux key range. TODO: currently, we assume non_inherited_range == aux_key_range. + // When we add more types of keys into the page server, we should revisit this part of code and throw errors + // accordingly. 
+ key = key.next(); + } + Err(MissingKey(err)) => { + return Err(GetVectoredError::MissingKey(err)); + } + Err(Other(err)) + if err + .to_string() + .contains("downloading evicted layer file failed") => + { + return Err(GetVectoredError::Other(err)) + } + Err(Other(err)) + if err + .chain() + .any(|cause| cause.to_string().contains("layer loading failed")) => + { + // The intent here is to achieve error parity with the vectored read path. + // When vectored read fails to load a layer it fails the whole read, hence + // we mimic this behaviour here to keep the validation happy. + return Err(GetVectoredError::Other(err)); + } + _ => { + values.insert(key, block); + key = key.next(); + } + } + } + } + + Ok(values) + } + + pub(super) async fn get_vectored_impl( + &self, + keyspace: KeySpace, + lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result>, GetVectoredError> { + let get_kind = if keyspace.total_raw_size() == 1 { + GetKind::Singular + } else { + GetKind::Vectored + }; + + let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(get_kind) + .start_timer(); + self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) + .await?; + get_data_timer.stop_and_record(); + + let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME + .for_get_kind(get_kind) + .start_timer(); + let mut results: BTreeMap> = BTreeMap::new(); + let layers_visited = reconstruct_state.get_layers_visited(); + + for (key, res) in std::mem::take(&mut reconstruct_state.keys) { + match res { + Err(err) => { + results.insert(key, Err(err)); + } + Ok(state) => { + let state = ValueReconstructState::from(state); + + let reconstruct_res = self.reconstruct_value(key, lsn, state).await; + results.insert(key, reconstruct_res); + } + } + } + reconstruct_timer.stop_and_record(); + + // For aux file keys (v1 or v2) the vectored read path does not return an error + // when they're missing. 
Instead they are omitted from the resulting btree + // (this is a requirement, not a bug). Skip updating the metric in these cases + // to avoid infinite results. + if !results.is_empty() { + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); + } + + Ok(results) + } + + /// Not subject to [`Self::timeline_get_throttle`]. + pub(super) async fn validate_get_vectored_impl( + &self, + vectored_res: &Result>, GetVectoredError>, + keyspace: KeySpace, + lsn: Lsn, + ctx: &RequestContext, + ) { + if keyspace.overlaps(&Key::metadata_key_range()) { + // skip validation for metadata key range + return; + } + + let sequential_res = self + .get_vectored_sequential_impl(keyspace.clone(), lsn, ctx) + .await; + + fn errors_match(lhs: &GetVectoredError, rhs: &GetVectoredError) -> bool { + use GetVectoredError::*; + match (lhs, rhs) { + (Oversized(l), Oversized(r)) => l == r, + (InvalidLsn(l), InvalidLsn(r)) => l == r, + (MissingKey(l), MissingKey(r)) => l.key == r.key, + (GetReadyAncestorError(_), GetReadyAncestorError(_)) => true, + (Other(_), Other(_)) => true, + _ => false, + } + } + + match (&sequential_res, vectored_res) { + (Err(GetVectoredError::Cancelled), _) => {}, + (_, Err(GetVectoredError::Cancelled)) => {}, + (Err(seq_err), Ok(_)) => { + panic!(concat!("Sequential get failed with {}, but vectored get did not", + " - keyspace={:?} lsn={}"), + seq_err, keyspace, lsn) }, + (Ok(_), Err(GetVectoredError::GetReadyAncestorError(GetReadyAncestorError::AncestorLsnTimeout(_)))) => { + // Sequential get runs after vectored get, so it is possible for the later + // to time out while waiting for its ancestor's Lsn to become ready and for the + // former to succeed (it essentially has a doubled wait time). 
+ }, + (Ok(_), Err(vec_err)) => { + panic!(concat!("Vectored get failed with {}, but sequential get did not", + " - keyspace={:?} lsn={}"), + vec_err, keyspace, lsn) }, + (Err(seq_err), Err(vec_err)) => { + assert!(errors_match(seq_err, vec_err), + "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")}, + (Ok(seq_values), Ok(vec_values)) => { + seq_values.iter().zip(vec_values.iter()).for_each(|((seq_key, seq_res), (vec_key, vec_res))| { + assert_eq!(seq_key, vec_key); + match (seq_res, vec_res) { + (Ok(seq_blob), Ok(vec_blob)) => { + Self::validate_key_equivalence(seq_key, &keyspace, lsn, seq_blob, vec_blob); + }, + (Err(err), Ok(_)) => { + panic!( + concat!("Sequential get failed with {} for key {}, but vectored get did not", + " - keyspace={:?} lsn={}"), + err, seq_key, keyspace, lsn) }, + (Ok(_), Err(err)) => { + panic!( + concat!("Vectored get failed with {} for key {}, but sequential get did not", + " - keyspace={:?} lsn={}"), + err, seq_key, keyspace, lsn) }, + (Err(_), Err(_)) => {} + } + }) + } + } + } + + fn validate_key_equivalence( + key: &Key, + keyspace: &KeySpace, + lsn: Lsn, + seq: &Bytes, + vec: &Bytes, + ) { + if *key == AUX_FILES_KEY { + // The value reconstruct of AUX_FILES_KEY from records is not deterministic + // since it uses a hash map under the hood. Hence, deserialise both results + // before comparing. + let seq_aux_dir_res = AuxFilesDirectory::des(seq); + let vec_aux_dir_res = AuxFilesDirectory::des(vec); + match (&seq_aux_dir_res, &vec_aux_dir_res) { + (Ok(seq_aux_dir), Ok(vec_aux_dir)) => { + assert_eq!( + seq_aux_dir, vec_aux_dir, + "Mismatch for key {} - keyspace={:?} lsn={}", + key, keyspace, lsn + ); + } + (Err(_), Err(_)) => {} + _ => { + panic!("Mismatch for {key}: {seq_aux_dir_res:?} != {vec_aux_dir_res:?}"); + } + } + } else { + // All other keys should reconstruct deterministically, so we simply compare the blobs. 
+ assert_eq!( + seq, vec, + "Image mismatch for key {key} - keyspace={keyspace:?} lsn={lsn}" + ); + } + } + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. - pub fn get_last_record_lsn(&self) -> Lsn { + pub(crate) fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load().last } - pub fn get_prev_record_lsn(&self) -> Lsn { + pub(crate) fn get_prev_record_lsn(&self) -> Lsn { self.last_record_lsn.load().prev } /// Atomically get both last and prev. - pub fn get_last_record_rlsn(&self) -> RecordLsn { + pub(crate) fn get_last_record_rlsn(&self) -> RecordLsn { self.last_record_lsn.load() } - pub fn get_disk_consistent_lsn(&self) -> Lsn { + /// Subscribe to callers of wait_lsn(). The value of the channel is None if there are no + /// wait_lsn() calls in progress, and Some(Lsn) if there is an active waiter for wait_lsn(). + pub(crate) fn subscribe_for_wait_lsn_updates(&self) -> watch::Receiver> { + self.last_record_lsn.status_receiver() + } + + pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } /// remote_consistent_lsn from the perspective of the tenant's current generation, /// not validated with control plane yet. /// See [`Self::get_remote_consistent_lsn_visible`]. - pub fn get_remote_consistent_lsn_projected(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_projected() - } else { - None - } + pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option { + self.remote_client.remote_consistent_lsn_projected() } /// remote_consistent_lsn which the tenant is guaranteed not to go backward from, /// i.e. a value of remote_consistent_lsn_projected which has undergone /// generation validation in the deletion queue. 
- pub fn get_remote_consistent_lsn_visible(&self) -> Option { - if let Some(remote_client) = &self.remote_client { - remote_client.remote_consistent_lsn_visible() - } else { - None - } + pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option { + self.remote_client.remote_consistent_lsn_visible() } /// The sum of the file size of all historic layers in the layer map. /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. - pub async fn layer_size_sum(&self) -> u64 { + pub(crate) async fn layer_size_sum(&self) -> u64 { let guard = self.layers.read().await; let layer_map = guard.layer_map(); let mut size = 0; for l in layer_map.iter_historic_layers() { - size += l.file_size(); + size += l.file_size; } size } - pub fn resident_physical_size(&self) -> u64 { + pub(crate) fn resident_physical_size(&self) -> u64 { self.metrics.resident_physical_size_get() } + pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] { + array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed)) + } + /// /// Wait until WAL has been received and processed up to this LSN. /// @@ -631,28 +1474,38 @@ impl Timeline { pub(crate) async fn wait_lsn( &self, lsn: Lsn, - _ctx: &RequestContext, /* Prepare for use by cancellation */ + who_is_waiting: WaitLsnWaiter<'_>, + ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { - if self.cancel.is_cancelled() { + let state = self.current_state(); + if self.cancel.is_cancelled() || matches!(state, TimelineState::Stopping) { return Err(WaitLsnError::Shutdown); - } else if !self.is_active() { - return Err(WaitLsnError::BadState); + } else if !matches!(state, TimelineState::Active) { + return Err(WaitLsnError::BadState(state)); } - // This should never be called from the WAL receiver, because that could lead - // to a deadlock. 
- debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), - "wait_lsn cannot be called in WAL receiver" - ); + if cfg!(debug_assertions) { + match ctx.task_kind() { + TaskKind::WalReceiverManager + | TaskKind::WalReceiverConnectionHandler + | TaskKind::WalReceiverConnectionPoller => { + let is_myself = match who_is_waiting { + WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), + WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + }; + if is_myself { + if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { + // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here + panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + } + } else { + // if another timeline's is waiting for us, there's no deadlock risk because + // our walreceiver task can make progress independent of theirs + } + } + _ => {} + } + } let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); @@ -694,7 +1547,7 @@ impl Timeline { } /// Check that it is valid to request operations with that lsn. - pub fn check_lsn_is_in_scope( + pub(crate) fn check_lsn_is_in_scope( &self, lsn: Lsn, latest_gc_cutoff_lsn: &RcuReadGuard, @@ -708,11 +1561,182 @@ impl Timeline { Ok(()) } + /// Obtains a temporary lease blocking garbage collection for the given LSN. + /// + /// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also + /// no existing lease to renew. 
If there is an existing lease in the map, the lease will be renewed only if + /// the request extends the lease. The returned lease is therefore the maximum between the existing lease and + /// the requesting lease. + pub(crate) fn make_lsn_lease( + &self, + lsn: Lsn, + length: Duration, + _ctx: &RequestContext, + ) -> anyhow::Result { + let lease = { + let mut gc_info = self.gc_info.write().unwrap(); + + let valid_until = SystemTime::now() + length; + + let entry = gc_info.leases.entry(lsn); + + let lease = { + if let Entry::Occupied(mut occupied) = entry { + let existing_lease = occupied.get_mut(); + if valid_until > existing_lease.valid_until { + existing_lease.valid_until = valid_until; + } + existing_lease.clone() + } else { + // Reject already GC-ed LSN (lsn < latest_gc_cutoff) + let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn(); + if lsn < *latest_gc_cutoff_lsn { + bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); + } + + entry.or_insert(LsnLease { valid_until }).clone() + } + }; + + lease + }; + + Ok(lease) + } + /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] - pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { - self.freeze_inmem_layer(false).await; - self.flush_frozen_layers_and_wait().await + pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> { + self.freeze_and_flush0().await + } + + // This exists to provide a non-span creating version of `freeze_and_flush` we can call without + // polluting the span hierarchy. + pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { + let to_lsn = { + // Freeze the current open in-memory layer. It will be written to disk on next + // iteration. 
+ let mut g = self.write_lock.lock().await; + + let to_lsn = self.get_last_record_lsn(); + self.freeze_inmem_layer_at(to_lsn, &mut g).await; + to_lsn + }; + self.flush_frozen_layers_and_wait(to_lsn).await + } + + // Check if an open ephemeral layer should be closed: this provides + // background enforcement of checkpoint interval if there is no active WAL receiver, to avoid keeping + // an ephemeral layer open forever when idle. It also freezes layers if the global limit on + // ephemeral layer bytes has been breached. + pub(super) async fn maybe_freeze_ephemeral_layer(&self) { + let Ok(mut write_guard) = self.write_lock.try_lock() else { + // If the write lock is held, there is an active wal receiver: rolling open layers + // is their responsibility while they hold this lock. + return; + }; + + let Ok(layers_guard) = self.layers.try_read() else { + // Don't block if the layer lock is busy + return; + }; + + let Some(open_layer) = &layers_guard.layer_map().open_layer else { + // If there is no open layer, we have no layer freezing to do. However, we might need to generate + // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions + // that didn't result in writes to this shard. + + // Must not hold the layers lock while waiting for a flush. 
+ drop(layers_guard); + + let last_record_lsn = self.get_last_record_lsn(); + let disk_consistent_lsn = self.get_disk_consistent_lsn(); + if last_record_lsn > disk_consistent_lsn { + // We have no open layer, but disk_consistent_lsn is behind the last record: this indicates + // we are a sharded tenant and have skipped some WAL + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + // Only do this if have been layer-less longer than get_checkpoint_timeout, so that a shard + // without any data ingested (yet) doesn't write a remote index as soon as it + // sees its LSN advance: we only do this if we've been layer-less + // for some time. + tracing::debug!( + "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}", + disk_consistent_lsn, + last_record_lsn + ); + + // The flush loop will update remote consistent LSN as well as disk consistent LSN. + self.flush_frozen_layers_and_wait(last_record_lsn) + .await + .ok(); + } + } + + return; + }; + + let Some(current_size) = open_layer.try_len() else { + // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so + // read lock to get size should always succeed. + tracing::warn!("Lock conflict while reading size of open layer"); + return; + }; + + let current_lsn = self.get_last_record_lsn(); + + let checkpoint_distance_override = open_layer.tick().await; + + if let Some(size_override) = checkpoint_distance_override { + if current_size > size_override { + // This is not harmful, but it only happens in relatively rare cases where + // time-based checkpoints are not happening fast enough to keep the amount of + // ephemeral data within configured limits. It's a sign of stress on the system. 
+ tracing::info!("Early-rolling open layer at size {current_size} (limit {size_override}) due to dirty data pressure"); + } + } + + let checkpoint_distance = + checkpoint_distance_override.unwrap_or(self.get_checkpoint_distance()); + + if self.should_roll( + current_size, + current_size, + checkpoint_distance, + self.get_last_record_lsn(), + self.last_freeze_at.load(), + open_layer.get_opened_at(), + ) { + let at_lsn = match open_layer.info() { + InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { + // We may reach this point if the layer was already frozen by not yet flushed: flushing + // happens asynchronously in the background. + tracing::debug!( + "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" + ); + None + } + InMemoryLayerInfo::Open { .. } => { + // Upgrade to a write lock and freeze the layer + drop(layers_guard); + let mut layers_guard = self.layers.write().await; + let froze = layers_guard + .try_freeze_in_memory_layer( + current_lsn, + &self.last_freeze_at, + &mut write_guard, + ) + .await; + Some(current_lsn).filter(|_| froze) + } + }; + if let Some(lsn) = at_lsn { + let res: Result = self.flush_frozen_layers(lsn); + if let Err(e) = res { + tracing::info!("failed to flush frozen layer after background freeze: {e:#}"); + } + } + } } /// Outermost timeline compaction operation; downloads needed layers. @@ -753,195 +1777,103 @@ impl Timeline { return Ok(()); } - // High level strategy for compaction / image creation: - // - // 1. First, calculate the desired "partitioning" of the - // currently in-use key space. The goal is to partition the - // key space into roughly fixed-size chunks, but also take into - // account any existing image layers, and try to align the - // chunk boundaries with the existing image layers to avoid - // too much churn. Also try to align chunk boundaries with - // relation boundaries. 
In principle, we don't know about - // relation boundaries here, we just deal with key-value - // pairs, and the code in pgdatadir_mapping.rs knows how to - // map relations into key-value pairs. But in practice we know - // that 'field6' is the block number, and the fields 1-5 - // identify a relation. This is just an optimization, - // though. - // - // 2. Once we know the partitioning, for each partition, - // decide if it's time to create a new image layer. The - // criteria is: there has been too much "churn" since the last - // image layer? The "churn" is fuzzy concept, it's a - // combination of too many delta files, or too much WAL in - // total in the delta file. Or perhaps: if creating an image - // file would allow to delete some older files. - // - // 3. After that, we compact all level0 delta files if there - // are too many of them. While compacting, we also garbage - // collect any page versions that are no longer needed because - // of the new image layers we created in step 2. - // - // TODO: This high level strategy hasn't been implemented yet. - // Below are functions compact_level0() and create_image_layers() - // but they are a bit ad hoc and don't quite work like it's explained - // above. Rewrite it. - - // Is the timeline being deleted? 
- if self.is_stopping() { - trace!("Dropping out of compaction on timeline shutdown"); - return Err(CompactionError::ShuttingDown); + match self.get_compaction_algorithm_settings().kind { + CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await, + CompactionAlgorithm::Legacy => self.compact_legacy(cancel, flags, ctx).await, } - - let target_file_size = self.get_checkpoint_distance(); - - // Define partitioning schema if needed - - // FIXME: the match should only cover repartitioning, not the next steps - match self - .repartition( - self.get_last_record_lsn(), - self.get_compaction_target_size(), - flags, - ctx, - ) - .await - { - Ok((partitioning, lsn)) => { - // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them - let image_ctx = RequestContextBuilder::extend(ctx) - .access_stats_behavior(AccessStatsBehavior::Skip) - .build(); - - // 2. Compact - let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(target_file_size, ctx).await?; - timer.stop_and_record(); - - // 3. Create new image layers for partitions that have been modified - // "enough". - let layers = self - .create_image_layers(&partitioning, lsn, false, &image_ctx) - .await - .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } - } - Err(err) => { - // no partitioning? This is normal, if the timeline was just created - // as an empty timeline. 
Also in unit tests, when we use the timeline - // as a simple key-value store, ignoring the datadir layout. Log the - // error but continue. - // - // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() { - error!("could not compact, repartitioning keyspace failed: {err:?}"); - } - } - }; - - Ok(()) } /// Mutate the timeline with a [`TimelineWriter`]. - pub async fn writer(&self) -> TimelineWriter<'_> { + pub(crate) async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - _write_guard: self.write_lock.lock().await, + write_guard: self.write_lock.lock().await, } } - /// Check if more than 'checkpoint_distance' of WAL has been accumulated in - /// the in-memory layer, and initiate flushing it if so. - /// - /// Also flush after a period of time without new data -- it helps - /// safekeepers to regard pageserver as caught up and suspend activity. - pub async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { - let last_lsn = self.get_last_record_lsn(); - let open_layer_size = { - let guard = self.layers.read().await; - let layers = guard.layer_map(); - let Some(open_layer) = layers.open_layer.as_ref() else { - return Ok(()); - }; - open_layer.size().await? - }; - let last_freeze_at = self.last_freeze_at.load(); - let last_freeze_ts = *(self.last_freeze_ts.read().unwrap()); - let distance = last_lsn.widening_sub(last_freeze_at); - // Checkpointing the open layer can be triggered by layer size or LSN range. - // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and - // we want to stay below that with a big margin. The LSN distance determines how - // much WAL the safekeepers need to store. 
- if distance >= self.get_checkpoint_distance().into() - || open_layer_size > self.get_checkpoint_distance() - || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout()) - { - info!( - "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}", - distance, - open_layer_size, - last_freeze_ts.elapsed() - ); - - self.freeze_inmem_layer(true).await; - self.last_freeze_at.store(last_lsn); - *(self.last_freeze_ts.write().unwrap()) = Instant::now(); - - // Wake up the layer flusher - self.flush_frozen_layers(); - } - Ok(()) - } - - pub fn activate( + pub(crate) fn activate( self: &Arc, + parent: Arc, broker_client: BrokerClientChannel, background_jobs_can_start: Option<&completion::Barrier>, ctx: &RequestContext, ) { - self.spawn_initial_logical_size_computation_task(ctx); + if self.tenant_shard_id.is_shard_zero() { + // Logical size is only maintained accurately on shard zero. + self.spawn_initial_logical_size_computation_task(ctx); + } self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); - self.launch_eviction_task(background_jobs_can_start); + self.launch_eviction_task(parent, background_jobs_can_start); } - /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then - /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// After this function returns, there are no timeline-scoped tasks left running. /// - /// While we are flushing, we continue to accept read I/O. 
- #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub(crate) async fn flush_and_shutdown(&self) { + /// The preferred pattern is: + /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token + /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required, + /// go the extra mile and keep track of JoinHandles + /// - Keep track of JoinHandles using a passed-down `Arc<Mutex<Option<JoinSet>>>` or similar, + /// instead of spawning directly on a runtime. It is a more composable / testable pattern. + /// + /// For legacy reasons, we still have multiple tasks spawned using + /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`. + /// We refer to these as "timeline-scoped task_mgr tasks". + /// Some of these tasks are already sensitive to Timeline::cancel while others are + /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`] + /// or [`task_mgr::shutdown_watcher`]. + /// We want to gradually convert the code base away from these. + /// + /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to + /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped + /// ones that aren't mentioned here): + /// - [`TaskKind::TimelineDeletionWorker`] + /// - NB: also used for tenant deletion + /// - [`TaskKind::RemoteUploadTask`] + /// - [`TaskKind::InitialLogicalSizeCalculation`] + /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?)
+ // Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive: + /// - [`TaskKind::Eviction`] + /// - [`TaskKind::LayerFlushTask`] + /// - [`TaskKind::OndemandLogicalSizeCalculation`] + /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped) + pub(crate) async fn shutdown(&self, mode: ShutdownMode) { debug_assert_current_span_has_tenant_and_timeline_id(); - // Stop ingesting data, so that we are not still writing to an InMemoryLayer while - // trying to flush - tracing::debug!("Waiting for WalReceiverManager..."); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + let try_freeze_and_flush = match mode { + ShutdownMode::FreezeAndFlush => true, + ShutdownMode::Hard => false, + }; - // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance + // Regardless of whether we're going to try_freeze_and_flush + // or not, stop ingesting any more data. Walreceiver only provides + // cancellation but no "wait until gone", because it uses the Timeline::gate. + // So, only after the self.gate.close() below will we know for sure that + // no walreceiver tasks are left. + // For `try_freeze_and_flush=true`, this means that we might still be ingesting + // data during the call to `self.freeze_and_flush()` below. + // That's not ideal, but, we don't have the concept of a ChildGuard, + // which is what we'd need to properly model early shutdown of the walreceiver + // task sub-tree before the other Timeline task sub-trees. + let walreceiver = self.walreceiver.lock().unwrap().take(); + tracing::debug!( + is_some = walreceiver.is_some(), + "Waiting for WalReceiverManager..." + ); + if let Some(walreceiver) = walreceiver { + walreceiver.cancel(); + } + // ... and inform any waiters for newer LSNs that there won't be any. 
self.last_record_lsn.shutdown(); - // now all writers to InMemory layer are gone, do the final flush if requested - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - if let Some(client) = self.remote_client.as_ref() { + if try_freeze_and_flush { + // we shut down walreceiver above, so, we won't add anything more + // to the InMemoryLayer; freeze it and wait for all frozen layers + // to reach the disk & upload queue, then shut the upload queue and + // wait for it to drain. + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue // if we did not wait for completion here, it might be our shutdown process // didn't wait for remote uploads to complete at all, as new tasks can forever // be spawned. @@ -949,62 +1881,47 @@ impl Timeline { // what is problematic is the shutting down of RemoteTimelineClient, because // obviously it does not make sense to stop while we wait for it, but what // about corner cases like s3 suddenly hanging up? - if let Err(e) = client.shutdown().await { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to flush to remote storage: {e:#}"); - } + self.remote_client.shutdown().await; + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); } - } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); } } - self.shutdown().await; - } - - /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of - /// the graceful [`Timeline::flush_and_shutdown`] function. 
- pub(crate) async fn shutdown(&self) { // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); - // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel - // while doing so. - self.last_record_lsn.shutdown(); - - // Shut down the layer flush task before the remote client, as one depends on the other + // Transition the remote_client into a state where it's only useful for timeline deletion. + // (The deletion use case is why we can't just hook up remote_client to Self::cancel.) + self.remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility + // to shut down the upload queue tasks. + // TODO: fix that, task management should be encapsulated inside remote_client. task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), + Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(self.timeline_id), ) .await; - // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in - // case our caller wants to use that for a deletion - if let Some(remote_client) = self.remote_client.as_ref() { - match remote_client.stop() { - Ok(()) => {} - Err(StopError::QueueUninitialized) => { - // Shutting down during initialization is legal - } - } - } - + // TODO: work toward making this a no-op. See this function's doc comment for more context. tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; - // Finally wait until any gate-holders are complete + // Finally wait until any gate-holders are complete. + // + // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks + // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left.
self.gate.close().await; + + self.metrics.shutdown(); } - pub fn set_state(&self, new_state: TimelineState) { + pub(crate) fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { info!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); @@ -1024,7 +1941,7 @@ impl Timeline { } } - pub fn set_broken(&self, reason: String) { + pub(crate) fn set_broken(&self, reason: String) { let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture()); let broken_state = TimelineState::Broken { reason, @@ -1038,27 +1955,27 @@ impl Timeline { self.cancel.cancel(); } - pub fn current_state(&self) -> TimelineState { + pub(crate) fn current_state(&self) -> TimelineState { self.state.borrow().clone() } - pub fn is_broken(&self) -> bool { + pub(crate) fn is_broken(&self) -> bool { matches!(&*self.state.borrow(), TimelineState::Broken { .. }) } - pub fn is_active(&self) -> bool { + pub(crate) fn is_active(&self) -> bool { self.current_state() == TimelineState::Active } - pub fn is_stopping(&self) -> bool { + pub(crate) fn is_stopping(&self) -> bool { self.current_state() == TimelineState::Stopping } - pub fn subscribe_for_state_updates(&self) -> watch::Receiver { + pub(crate) fn subscribe_for_state_updates(&self) -> watch::Receiver { self.state.subscribe() } - pub async fn wait_to_become_active( + pub(crate) async fn wait_to_become_active( &self, _ctx: &RequestContext, // Prepare for use by cancellation ) -> Result<(), TimelineState> { @@ -1083,7 +2000,7 @@ impl Timeline { } } - pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { let guard = self.layers.read().await; let layer_map = guard.layer_map(); let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); @@ -1107,15 +2024,14 @@ impl Timeline { } 
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn download_layer( + &self, + layer_file_name: &LayerName, + ) -> anyhow::Result> { let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None); }; - if self.remote_client.is_none() { - return Ok(Some(false)); - } - layer.download().await?; Ok(Some(true)) @@ -1124,7 +2040,10 @@ impl Timeline { /// Evict just one layer. /// /// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`. - pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { + pub(crate) async fn evict_layer( + &self, + layer_file_name: &LayerName, + ) -> anyhow::Result> { let _gate = self .gate .enter() @@ -1134,15 +2053,61 @@ impl Timeline { return Ok(None); }; - let rtc = self - .remote_client - .as_ref() - .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?; + // curl has this by default + let timeout = std::time::Duration::from_secs(120); - match local_layer.evict_and_wait(rtc).await { + match local_layer.evict_and_wait(timeout).await { Ok(()) => Ok(Some(true)), Err(EvictionError::NotFound) => Ok(Some(false)), Err(EvictionError::Downloaded) => Ok(Some(false)), + Err(EvictionError::Timeout) => Ok(Some(false)), + } + } + + fn should_roll( + &self, + layer_size: u64, + projected_layer_size: u64, + checkpoint_distance: u64, + projected_lsn: Lsn, + last_freeze_at: Lsn, + opened_at: Instant, + ) -> bool { + let distance = projected_lsn.widening_sub(last_freeze_at); + + // Rolling the open layer can be triggered by: + // 1. The distance from the last LSN we rolled at. This bounds the amount of WAL that + // the safekeepers need to store. 
For sharded tenants, we multiply by shard count to + // account for how writes are distributed across shards: we expect each node to consume + // 1/count of the LSN on average. + // 2. The size of the currently open layer. + // 3. The time since the last roll. It helps safekeepers to regard pageserver as caught + // up and suspend activity. + if distance >= checkpoint_distance as i128 * self.shard_identity.count.count() as i128 { + info!( + "Will roll layer at {} with layer size {} due to LSN distance ({})", + projected_lsn, layer_size, distance + ); + + true + } else if projected_layer_size >= checkpoint_distance { + info!( + "Will roll layer at {} with layer size {} due to layer size ({})", + projected_lsn, layer_size, projected_layer_size + ); + + true + } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { + info!( + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); + + true + } else { + false } } } @@ -1152,44 +2117,94 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { - fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + pub(crate) fn get_lsn_lease_length(&self) -> Duration { + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf + .lsn_lease_length + .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) + } + + // TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072 + #[allow(unused)] + pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .lsn_lease_length_for_ts + .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) + } + + pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf 
+ .switch_aux_file_policy + .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy) + } + + pub(crate) fn get_lazy_slru_download(&self) -> bool { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .lazy_slru_download + .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) + } + + fn get_checkpoint_distance(&self) -> u64 { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } - fn get_eviction_policy(&self) -> EvictionPolicy { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf; + fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings { + let tenant_conf = &self.tenant_conf.load(); tenant_conf + .tenant_conf + .compaction_algorithm + .as_ref() + .unwrap_or(&self.conf.default_tenant_conf.compaction_algorithm) + 
.clone() + } + + fn get_eviction_policy(&self) -> EvictionPolicy { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf .eviction_policy .unwrap_or(self.conf.default_tenant_conf.eviction_policy) } @@ -1203,21 +2218,26 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } - fn get_gc_feedback(&self) -> bool { - let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; + fn get_image_layer_creation_check_threshold(&self) -> u8 { + let tenant_conf = self.tenant_conf.load(); tenant_conf - .gc_feedback - .unwrap_or(self.conf.default_tenant_conf.gc_feedback) + .tenant_conf + .image_layer_creation_check_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_layer_creation_check_threshold, + ) } - pub(super) fn tenant_conf_updated(&self) { + pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. // The threshold is embedded in the metric. So, we need to update it. 
{ let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - &self.tenant_conf.read().unwrap().tenant_conf, + new_conf, &self.conf.default_tenant_conf, ); @@ -1244,35 +2264,45 @@ impl Timeline { #[allow(clippy::too_many_arguments)] pub(super) fn new( conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, tenant_shard_id: TenantShardId, generation: Generation, shard_identity: ShardIdentity, - walredo_mgr: Arc, + walredo_mgr: Option>, resources: TimelineResources, pg_version: u32, state: TimelineState, + aux_file_policy: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(state); - let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); + let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn)); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); - let tenant_conf_guard = tenant_conf.read().unwrap(); - - let evictions_low_residence_duration_metric_threshold = + let evictions_low_residence_duration_metric_threshold = { + let loaded_tenant_conf = tenant_conf.load(); Self::get_evictions_low_residence_duration_metric_threshold( - &tenant_conf_guard.tenant_conf, + &loaded_tenant_conf.tenant_conf, &conf.default_tenant_conf, - ); - drop(tenant_conf_guard); + ) + }; Arc::new_cyclic(|myself| { + let metrics = TimelineMetrics::new( + &tenant_shard_id, + &timeline_id, + crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( + "mtime", + evictions_low_residence_duration_metric_threshold, + ), + ); + let aux_file_metrics = metrics.aux_file_size_gauge.clone(); + let mut result = Timeline { conf, tenant_conf, @@ -1282,13 +2312,12 @@ impl Timeline { generation, shard_identity, pg_version, - layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())), - wanted_image_layers: Mutex::new(None), + layers: 
Default::default(), walredo_mgr, walreceiver: Mutex::new(None), - remote_client: resources.remote_client.map(Arc::new), + remote_client: Arc::new(resources.remote_client), // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'. last_record_lsn: SeqWait::new(RecordLsn { @@ -1305,27 +2334,23 @@ impl Timeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - metrics: TimelineMetrics::new( + metrics, + + query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new( &tenant_shard_id, &timeline_id, - crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( - "mtime", - evictions_low_residence_duration_metric_threshold, - ), ), + directory_metrics: array::from_fn(|_| AtomicU64::new(0)), + flush_loop_state: Mutex::new(FlushLoopState::NotStarted), layer_flush_start_tx, layer_flush_done_tx, - write_lock: tokio::sync::Mutex::new(()), + write_lock: tokio::sync::Mutex::new(None), - gc_info: std::sync::RwLock::new(GcInfo { - retain_lsns: Vec::new(), - horizon_cutoff: Lsn(0), - pitr_cutoff: Lsn(0), - }), + gc_info: std::sync::RwLock::new(GcInfo::default()), latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -1339,11 +2364,18 @@ impl Timeline { // initial logical size is 0. 
LogicalSize::empty_initial() }, - partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))), + partitioning: tokio::sync::Mutex::new(( + (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), + Lsn(0), + )), repartition_threshold: 0, + last_image_layer_creation_check_at: AtomicLsn::new(0), last_received_wal: Mutex::new(None), - rel_size_cache: RwLock::new(HashMap::new()), + rel_size_cache: RwLock::new(RelSizeCache { + complete_as_of: disk_consistent_lsn, + map: HashMap::new(), + }), download_all_remote_layers_task_info: RwLock::new(None), @@ -1355,13 +2387,30 @@ impl Timeline { delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), cancel, - gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")), + gate: Gate::default(), compaction_lock: tokio::sync::Mutex::default(), gc_lock: tokio::sync::Mutex::default(), + + standby_horizon: AtomicLsn::new(0), + + timeline_get_throttle: resources.timeline_get_throttle, + + aux_files: tokio::sync::Mutex::new(AuxFilesState { + dir: None, + n_deltas: 0, + }), + + aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), + + last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), + + #[cfg(test)] + extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; + result .metrics .last_record_gauge @@ -1416,7 +2465,7 @@ impl Timeline { let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..})); + assert!(matches!(*flush_loop_state, FlushLoopState::Running{..})); *flush_loop_state = FlushLoopState::Exited; Ok(()) } @@ -1438,20 +2487,19 @@ impl Timeline { self.timeline_id, self.tenant_shard_id ); - let 
tenant_conf_guard = self.tenant_conf.read().unwrap(); - let wal_connect_timeout = tenant_conf_guard + let tenant_conf = self.tenant_conf.load(); + let wal_connect_timeout = tenant_conf .tenant_conf .walreceiver_connect_timeout .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); - let lagging_wal_timeout = tenant_conf_guard + let lagging_wal_timeout = tenant_conf .tenant_conf .lagging_wal_timeout .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); - let max_lsn_wal_lag = tenant_conf_guard + let max_lsn_wal_lag = tenant_conf .tenant_conf .max_lsn_wal_lag .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); - drop(tenant_conf_guard); let mut guard = self.walreceiver.lock().unwrap(); assert!( @@ -1489,13 +2537,13 @@ impl Timeline { index_part: Option, ) -> anyhow::Result<()> { use init::{Decision::*, Discovered, DismissedLayer}; - use LayerFileName::*; + use LayerName::*; let mut guard = self.layers.write().await; let timer = self.metrics.load_layer_map_histo.start_timer(); - // Scan timeline directory and create ImageFileName and DeltaFilename + // Scan timeline directory and create ImageLayerName and DeltaFilename // structs representing all files on disk let timeline_path = self .conf @@ -1504,8 +2552,6 @@ impl Timeline { let span = tracing::Span::current(); // Copy to move into the task we're about to spawn - let generation = self.generation; - let shard = self.get_shard_index(); let this = self.myself.upgrade().expect("&self method holds the arc"); let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({ @@ -1519,11 +2565,14 @@ impl Timeline { for discovered in discovered { let (name, kind) = match discovered { - Discovered::Layer(file_name, file_size) => { - discovered_layers.push((file_name, file_size)); + Discovered::Layer(layer_file_name, local_metadata) => { + discovered_layers.push((layer_file_name, local_metadata)); continue; } - Discovered::Metadata | Discovered::IgnoredBackup => { + 
Discovered::IgnoredBackup(path) => { + std::fs::remove_file(path) + .or_else(fs_ext::ignore_not_found) + .fatal_err("Removing .old file"); continue; } Discovered::Unknown(file_name) => { @@ -1549,13 +2598,8 @@ impl Timeline { ); } - let decided = init::reconcile( - discovered_layers, - index_part.as_ref(), - disk_consistent_lsn, - generation, - shard, - ); + let decided = + init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn); let mut loaded_layers = Vec::new(); let mut needs_cleanup = Vec::new(); @@ -1563,34 +2607,25 @@ impl Timeline { for (name, decision) in decided { let decision = match decision { - Ok(UseRemote { local, remote }) => { - // Remote is authoritative, but we may still choose to retain - // the local file if the contents appear to match - if local.file_size() == remote.file_size() { - // Use the local file, but take the remote metadata so that we pick up - // the correct generation. - UseLocal(remote) - } else { - path.push(name.file_name()); - init::cleanup_local_file_for_remote(&path, &local, &remote)?; - path.pop(); - UseRemote { local, remote } - } - } Ok(decision) => decision, Err(DismissedLayer::Future { local }) => { - if local.is_some() { - path.push(name.file_name()); - init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?; - path.pop(); + if let Some(local) = local { + init::cleanup_future_layer( + &local.local_path, + &name, + disk_consistent_lsn, + )?; } needs_cleanup.push(name); continue; } Err(DismissedLayer::LocalOnly(local)) => { - path.push(name.file_name()); - init::cleanup_local_only_file(&path, &name, &local)?; - path.pop(); + init::cleanup_local_only_file(&name, &local)?; + // this file never existed remotely, we will have to do rework + continue; + } + Err(DismissedLayer::BadMetadata(local)) => { + init::cleanup_local_file_for_remote(&local)?; // this file never existed remotely, we will have to do rework continue; } @@ -1604,13 +2639,12 @@ impl Timeline { tracing::debug!(layer=%name, 
?decision, "applied"); let layer = match decision { - UseLocal(m) => { - total_physical_size += m.file_size(); - Layer::for_resident(conf, &this, name, m).drop_eviction_guard() - } - Evicted(remote) | UseRemote { remote, .. } => { - Layer::for_evicted(conf, &this, name, remote) + Resident { local, remote } => { + total_physical_size += local.file_size; + Layer::for_resident(conf, &this, local.local_path, name, remote) + .drop_eviction_guard() } + Evicted(remote) => Layer::for_evicted(conf, &this, name, remote), }; loaded_layers.push(layer); @@ -1626,36 +2660,36 @@ impl Timeline { guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1); - if let Some(rtc) = self.remote_client.as_ref() { - rtc.schedule_layer_file_deletion(&needs_cleanup)?; - rtc.schedule_index_upload_for_file_changes()?; - // This barrier orders above DELETEs before any later operations. - // This is critical because code executing after the barrier might - // create again objects with the same key that we just scheduled for deletion. - // For example, if we just scheduled deletion of an image layer "from the future", - // later compaction might run again and re-create the same image layer. - // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. - // "same" here means same key range and LSN. - // - // Without a barrier between above DELETEs and the re-creation's PUTs, - // the upload queue may execute the PUT first, then the DELETE. - // In our example, we will end up with an IndexPart referencing a non-existent object. - // - // 1. a future image layer is created and uploaded - // 2. ps restart - // 3. the future layer from (1) is deleted during load layer map - // 4. image layer is re-created and uploaded - // 5. deletion queue would like to delete (1) but actually deletes (4) - // 6. 
delete by name works as expected, but it now deletes the wrong (later) version - // - // See https://github.com/neondatabase/neon/issues/5878 - // - // NB: generation numbers naturally protect against this because they disambiguate - // (1) and (4) - rtc.schedule_barrier()?; - // Tenant::create_timeline will wait for these uploads to happen before returning, or - // on retry. - } + self.remote_client + .schedule_layer_file_deletion(&needs_cleanup)?; + self.remote_client + .schedule_index_upload_for_file_changes()?; + // This barrier orders above DELETEs before any later operations. + // This is critical because code executing after the barrier might + // create again objects with the same key that we just scheduled for deletion. + // For example, if we just scheduled deletion of an image layer "from the future", + // later compaction might run again and re-create the same image layer. + // "from the future" here means an image layer whose LSN is > IndexPart::disk_consistent_lsn. + // "same" here means same key range and LSN. + // + // Without a barrier between above DELETEs and the re-creation's PUTs, + // the upload queue may execute the PUT first, then the DELETE. + // In our example, we will end up with an IndexPart referencing a non-existent object. + // + // 1. a future image layer is created and uploaded + // 2. ps restart + // 3. the future layer from (1) is deleted during load layer map + // 4. image layer is re-created and uploaded + // 5. deletion queue would like to delete (1) but actually deletes (4) + // 6. delete by name works as expected, but it now deletes the wrong (later) version + // + // See https://github.com/neondatabase/neon/issues/5878 + // + // NB: generation numbers naturally protect against this because they disambiguate + // (1) and (4) + self.remote_client.schedule_barrier()?; + // Tenant::create_timeline will wait for these uploads to happen before returning, or + // on retry. 
info!( "loaded layer map with {} layers at {}, total physical size: {}", @@ -1677,6 +2711,12 @@ impl Timeline { priority: GetLogicalSizePriority, ctx: &RequestContext, ) -> logical_size::CurrentLogicalSize { + if !self.tenant_shard_id.is_shard_zero() { + // Logical size is only accurately maintained on shard zero: when called elsewhere, for example + // when HTTP API is serving a GET for timeline zero, return zero + return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero()); + } + let current_size = self.current_logical_size.current_size(); debug!("Current size: {current_size:?}"); @@ -1708,6 +2748,7 @@ impl Timeline { // Don't make noise. } else { warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work"); + debug_assert!(false); } } }; @@ -1777,7 +2818,7 @@ impl Timeline { .await; Ok(()) } - .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)), + .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)), ); } @@ -1793,11 +2834,6 @@ impl Timeline { self.current_logical_size.initialized.add_permits(1); } - enum BackgroundCalculationError { - Cancelled, - Other(anyhow::Error), - } - let try_once = |attempt: usize| { let background_ctx = &background_ctx; let self_ref = &self; @@ -1815,10 +2851,10 @@ impl Timeline { (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit) } _ = self_ref.cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); } _ = cancel.cancelled() => { - return Err(BackgroundCalculationError::Cancelled); + return Err(CalculateLogicalSizeError::Cancelled); }, () = skip_concurrency_limiter.cancelled() => { // Some action that is part of a end 
user interaction requested logical size @@ -1836,28 +2872,21 @@ impl Timeline { crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances) }; - match self_ref + let calculated_size = self_ref .logical_size_calculation_task( initial_part_end, LogicalSizeCalculationCause::Initial, background_ctx, ) - .await - { - Ok(calculated_size) => Ok((calculated_size, metrics_guard)), - Err(CalculateLogicalSizeError::Cancelled) => { - Err(BackgroundCalculationError::Cancelled) - } - Err(CalculateLogicalSizeError::Other(err)) => { - if let Some(PageReconstructError::AncestorStopping(_)) = - err.root_cause().downcast_ref() - { - Err(BackgroundCalculationError::Cancelled) - } else { - Err(BackgroundCalculationError::Other(err)) - } - } - } + .await?; + + self_ref + .trigger_aux_file_size_computation(initial_part_end, background_ctx) + .await?; + + // TODO: add aux file size to logical size + + Ok((calculated_size, metrics_guard)) } }; @@ -1868,8 +2897,11 @@ impl Timeline { match try_once(attempt).await { Ok(res) => return ControlFlow::Continue(res), - Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()), - Err(BackgroundCalculationError::Other(e)) => { + Err(CalculateLogicalSizeError::Cancelled) => return ControlFlow::Break(()), + Err( + e @ (CalculateLogicalSizeError::Decode(_) + | CalculateLogicalSizeError::PageRead(_)), + ) => { warn!(attempt, "initial size calculation failed: {e:?}"); // exponential back-off doesn't make sense at these long intervals; // use fixed retry interval with generous jitter instead @@ -1919,7 +2951,7 @@ impl Timeline { .expect("only this task sets it"); } - pub fn spawn_ondemand_logical_size_calculation( + pub(crate) fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, cause: LogicalSizeCalculationCause, @@ -1964,16 +2996,22 @@ impl Timeline { cause: LogicalSizeCalculationCause, ctx: &RequestContext, ) -> Result { - span::debug_assert_current_span_has_tenant_and_timeline_id(); + 
crate::span::debug_assert_current_span_has_tenant_and_timeline_id(); + // We should never be calculating logical sizes on shard !=0, because these shards do not have + // accurate relation sizes, and they do not emit consumption metrics. + debug_assert!(self.tenant_shard_id.is_shard_zero()); - let _guard = self.gate.enter(); + let guard = self + .gate + .enter() + .map_err(|_| CalculateLogicalSizeError::Cancelled)?; let self_calculation = Arc::clone(self); let mut calculation = pin!(async { let ctx = ctx.attached_child(); self_calculation - .calculate_logical_size(lsn, cause, &ctx) + .calculate_logical_size(lsn, cause, &guard, &ctx) .await }); @@ -1983,10 +3021,6 @@ impl Timeline { debug!("cancelling logical size calculation for timeline shutdown"); calculation.await } - _ = task_mgr::shutdown_watcher() => { - debug!("cancelling logical size calculation for task shutdown"); - calculation.await - } } } @@ -1998,37 +3032,20 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. - pub async fn calculate_logical_size( + async fn calculate_logical_size( &self, up_to_lsn: Lsn, cause: LogicalSizeCalculationCause, + _guard: &GateGuard, ctx: &RequestContext, ) -> Result { info!( "Calculating logical size for timeline {} at {}", self.timeline_id, up_to_lsn ); - // These failpoints are used by python tests to ensure that we don't delete - // the timeline while the logical size computation is ongoing. - // The first failpoint is used to make this function pause. - // Then the python test initiates timeline delete operation in a thread. - // It waits for a few seconds, then arms the second failpoint and disables - // the first failpoint. The second failpoint prints an error if the timeline - // delete code has deleted the on-disk state while we're still running here. - // It shouldn't do that. If it does it anyway, the error will be caught - // by the test suite, highlighting the problem. 
- fail::fail_point!("timeline-calculate-logical-size-pause"); - fail::fail_point!("timeline-calculate-logical-size-check-dir-exists", |_| { - if !self - .conf - .metadata_path(&self.tenant_shard_id, &self.timeline_id) - .exists() - { - error!("timeline-calculate-logical-size-pre metadata file does not exist") - } - // need to return something - Ok(0) - }); + + pausable_failpoint!("timeline-calculate-logical-size-pause"); + // See if we've already done the work for initial size calculation. // This is a short-cut for timelines that are mostly unused. if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) { @@ -2073,11 +3090,34 @@ impl Timeline { } } - async fn find_layer(&self, layer_file_name: &str) -> Option { + pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) { + self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed); + let aux_metric = + self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed); + + let sum_of_entries = self + .directory_metrics + .iter() + .map(|v| v.load(AtomicOrdering::Relaxed)) + .sum(); + // Set a high general threshold and a lower threshold for the auxiliary files, + // as we can have large numbers of relations in the db directory. 
+ const SUM_THRESHOLD: u64 = 5000; + const AUX_THRESHOLD: u64 = 1000; + if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD { + self.metrics + .directory_entries_count_gauge + .set(sum_of_entries); + } else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) { + metric.set(sum_of_entries); + } + } + + async fn find_layer(&self, layer_name: &LayerName) -> Option { let guard = self.layers.read().await; for historic_layer in guard.layer_map().iter_historic_layers() { - let historic_layer_name = historic_layer.filename().file_name(); - if layer_file_name == historic_layer_name { + let historic_layer_name = historic_layer.layer_name(); + if layer_name == &historic_layer_name { return Some(guard.get_from_desc(&historic_layer)); } } @@ -2093,49 +3133,38 @@ impl Timeline { /// should treat this as a cue to simply skip doing any heatmap uploading /// for this timeline. pub(crate) async fn generate_heatmap(&self) -> Option { - let eviction_info = self.get_local_layers_for_disk_usage_eviction().await; + if !self.is_active() { + return None; + } - let remote_client = match &self.remote_client { - Some(c) => c, - None => return None, - }; + let guard = self.layers.read().await; - let layer_file_names = eviction_info - .resident_layers - .iter() - .map(|l| l.layer.layer_desc().filename()) - .collect::>(); + let resident = guard.likely_resident_layers().map(|layer| { + let last_activity_ts = layer.access_stats().latest_activity_or_now(); - let decorated = match remote_client.get_layers_metadata(layer_file_names) { - Ok(d) => d, - Err(_) => { - // Getting metadata only fails on Timeline in bad state. 
- return None; - } - }; - - let heatmap_layers = std::iter::zip( - eviction_info.resident_layers.into_iter(), - decorated.into_iter(), - ) - .filter_map(|(layer, remote_info)| { - remote_info.map(|remote_info| { - HeatMapLayer::new( - layer.layer.layer_desc().filename(), - IndexLayerMetadata::from(remote_info), - layer.last_activity_ts, - ) - }) + HeatMapLayer::new( + layer.layer_desc().layer_name(), + layer.metadata(), + last_activity_ts, + ) }); - Some(HeatMapTimeline::new( - self.timeline_id, - heatmap_layers.collect(), - )) + let layers = resident.collect(); + + Some(HeatMapTimeline::new(self.timeline_id, layers)) + } + + /// Returns true if the given lsn is or was an ancestor branchpoint. + pub(crate) fn is_ancestor_lsn(&self, lsn: Lsn) -> bool { + // upon timeline detach, we set the ancestor_lsn to Lsn::INVALID and the store the original + // branchpoint in the value in IndexPart::lineage + self.ancestor_lsn == lsn + || (self.ancestor_lsn == Lsn::INVALID + && self.remote_client.is_previous_ancestor_lsn(lsn)) } } -type TraversalId = String; +type TraversalId = Arc; trait TraversalLayerExt { fn traversal_id(&self) -> TraversalId; @@ -2143,13 +3172,13 @@ trait TraversalLayerExt { impl TraversalLayerExt for Layer { fn traversal_id(&self) -> TraversalId { - self.local_path().to_string() + Arc::clone(self.debug_str()) } } impl TraversalLayerExt for Arc { fn traversal_id(&self) -> TraversalId { - format!("timeline {} in-memory {self}", self.get_timeline_id()) + Arc::clone(self.local_path_str()) } } @@ -2178,7 +3207,7 @@ impl Timeline { let mut timeline = self; let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) }); // For debugging purposes, collect the path of layers that we traversed @@ -2194,7 +3223,7 @@ impl Timeline { // 'prev_lsn' tracks the last LSN that we were at in our search. 
It's used // to check that each iteration make some progress, to break infinite // looping if something goes wrong. - let mut prev_lsn = Lsn(u64::MAX); + let mut prev_lsn = None; let mut result = ValueReconstructResult::Continue; let mut cont_lsn = Lsn(request_lsn.0 + 1); @@ -2214,101 +3243,56 @@ impl Timeline { MATERIALIZED_PAGE_CACHE_HIT.inc_by(1); return Ok(traversal_path); } - if prev_lsn <= cont_lsn { - // Didn't make any progress in last iteration. Error out to avoid - // getting stuck in the loop. - return Err(layer_traversal_error(format!( - "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", - key, - Lsn(cont_lsn.0 - 1), - request_lsn, - timeline.ancestor_lsn - ), traversal_path)); + if let Some(prev) = prev_lsn { + if prev <= cont_lsn { + // Didn't make any progress in last iteration. Error out to avoid + // getting stuck in the loop. + return Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn: Lsn(cont_lsn.0 - 1), + request_lsn, + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path, + backtrace: None, + })); + } } - prev_lsn = cont_lsn; + prev_lsn = Some(cont_lsn); } ValueReconstructResult::Missing => { - return Err(layer_traversal_error( - if cfg!(test) { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), - ) - } else { - format!( - "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}", - key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn - ) - }, + return Err(PageReconstructError::MissingKey(MissingKeyError { + key, + shard: self.shard_identity.get_shard_number(&key), + cont_lsn, + request_lsn, + ancestor_lsn: None, traversal_path, - )); + backtrace: if cfg!(test) { + Some(std::backtrace::Backtrace::force_capture()) + } else { + 
None + }, + })); } } // Recurse into ancestor if needed - if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( - "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn - ); - let ancestor = match timeline.get_ancestor_timeline() { - Ok(timeline) => timeline, - Err(e) => return Err(PageReconstructError::from(e)), - }; + if let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() { + if key.is_inherited_key() && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + trace!( + "going into ancestor {}, cont_lsn is {}", + timeline.ancestor_lsn, + cont_lsn + ); - // It's possible that the ancestor timeline isn't active yet, or - // is active but hasn't yet caught up to the branch point. Wait - // for it. - // - // This cannot happen while the pageserver is running normally, - // because you cannot create a branch from a point that isn't - // present in the pageserver yet. However, we don't wait for the - // branch point to be uploaded to cloud storage before creating - // a branch. I.e., the branch LSN need not be remote consistent - // for the branching operation to succeed. - // - // Hence, if we try to load a tenant in such a state where - // 1. the existence of the branch was persisted (in IndexPart and/or locally) - // 2. but the ancestor state is behind branch_lsn because it was not yet persisted - // then we will need to wait for the ancestor timeline to - // re-stream WAL up to branch_lsn before we access it. - // - // How can a tenant get in such a state? - // - ungraceful pageserver process exit - // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 - // - // NB: this could be avoided by requiring - // branch_lsn >= remote_consistent_lsn - // during branch creation. 
- match ancestor.wait_to_become_active(ctx).await { - Ok(()) => {} - Err(TimelineState::Stopping) => { - return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id)); - } - Err(state) => { - return Err(PageReconstructError::Other(anyhow::anyhow!( - "Timeline {} will not become active. Current state: {:?}", - ancestor.timeline_id, - &state, - ))); - } + timeline_owned = timeline + .get_ready_ancestor_timeline(ancestor_timeline, ctx) + .await?; + timeline = &*timeline_owned; + prev_lsn = None; + continue 'outer; } - ancestor - .wait_lsn(timeline.ancestor_lsn, ctx) - .await - .map_err(|e| match e { - e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e), - WaitLsnError::Shutdown => PageReconstructError::Cancelled, - e @ WaitLsnError::BadState => { - PageReconstructError::Other(anyhow::anyhow!(e)) - } - })?; - - timeline_owned = ancestor; - timeline = &*timeline_owned; - prev_lsn = Lsn(u64::MAX); - continue 'outer; } let guard = timeline.layers.read().await; @@ -2319,10 +3303,14 @@ impl Timeline { if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.layer_name().display()); // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. 
let lsn_floor = max(cached_lsn + 1, start_lsn); + + let open_layer = open_layer.clone(); + drop(guard); + result = match open_layer .get_value_reconstruct_data( key, @@ -2336,23 +3324,20 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let open_layer = Arc::clone(open_layer); - move || open_layer.traversal_id() - }), - )); + *read_count += 1; + traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } } for frozen_layer in layers.frozen_layers.iter().rev() { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.layer_name().display()); let lsn_floor = max(cached_lsn + 1, start_lsn); + + let frozen_layer = frozen_layer.clone(); + drop(guard); + result = match frozen_layer .get_value_reconstruct_data( key, @@ -2366,21 +3351,15 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let frozen_layer = Arc::clone(frozen_layer); - move || frozen_layer.traversal_id() - }), - )); + *read_count += 1; + traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } } if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) { let layer = guard.get_from_desc(&layer); + drop(guard); // Get all the data needed to reconstruct the page version from this layer. // But if we have an older cached page image, no need to go past that. 
let lsn_floor = max(cached_lsn + 1, lsn_floor); @@ -2393,14 +3372,7 @@ impl Timeline { }; cont_lsn = lsn_floor; *read_count += 1; - traversal_path.push(( - result, - cont_lsn, - Box::new({ - let layer = layer.to_owned(); - move || layer.traversal_id() - }), - )); + traversal_path.push((result, cont_lsn, layer.traversal_id())); continue 'outer; } else if timeline.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent @@ -2415,6 +3387,233 @@ impl Timeline { } } + /// Get the data needed to reconstruct all keys in the provided keyspace + /// + /// The algorithm is as follows: + /// 1. While some keys are still not done and there's a timeline to visit: + /// 2. Visit the timeline (see [`Timeline::get_vectored_reconstruct_data_timeline`]: + /// 2.1: Build the fringe for the current keyspace + /// 2.2 Visit the newest layer from the fringe to collect all values for the range it + /// intersects + /// 2.3. Pop the timeline from the fringe + /// 2.4. If the fringe is empty, go back to 1 + async fn get_vectored_reconstruct_data( + &self, + mut keyspace: KeySpace, + request_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> Result<(), GetVectoredError> { + let mut timeline_owned: Arc; + let mut timeline = self; + + let mut cont_lsn = Lsn(request_lsn.0 + 1); + + let missing_keyspace = loop { + if self.cancel.is_cancelled() { + return Err(GetVectoredError::Cancelled); + } + + let TimelineVisitOutcome { + completed_keyspace: completed, + image_covered_keyspace, + } = Self::get_vectored_reconstruct_data_timeline( + timeline, + keyspace.clone(), + cont_lsn, + reconstruct_state, + &self.cancel, + ctx, + ) + .await?; + + keyspace.remove_overlapping_with(&completed); + + // Do not descend into the ancestor timeline for aux files. + // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid + // stalling compaction. 
+ keyspace.remove_overlapping_with(&KeySpace { + ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE], + }); + + // Keyspace is fully retrieved + if keyspace.is_empty() { + break None; + } + + let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else { + // Not fully retrieved but no ancestor timeline. + break Some(keyspace); + }; + + // Now we see if there are keys covered by the image layer but does not exist in the + // image layer, which means that the key does not exist. + + // The block below will stop the vectored search if any of the keys encountered an image layer + // which did not contain a snapshot for said key. Since we have already removed all completed + // keys from `keyspace`, we expect there to be no overlap between it and the image covered key + // space. If that's not the case, we had at least one key encounter a gap in the image layer + // and stop the search as a result of that. + let removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + if !removed.is_empty() { + break Some(removed); + } + // If we reached this point, `remove_overlapping_with` should not have made any change to the + // keyspace. + + // Take the min to avoid reconstructing a page with data newer than request Lsn. 
+ cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); + timeline_owned = timeline + .get_ready_ancestor_timeline(ancestor_timeline, ctx) + .await + .map_err(GetVectoredError::GetReadyAncestorError)?; + timeline = &*timeline_owned; + }; + + if let Some(missing_keyspace) = missing_keyspace { + return Err(GetVectoredError::MissingKey(MissingKeyError { + key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ + shard: self + .shard_identity + .get_shard_number(&missing_keyspace.start().unwrap()), + cont_lsn, + request_lsn, + ancestor_lsn: Some(timeline.ancestor_lsn), + traversal_path: vec![], + backtrace: None, + })); + } + + Ok(()) + } + + /// Collect the reconstruct data for a keyspace from the specified timeline. + /// + /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect + /// the current keyspace. The current keyspace of the search at any given timeline + /// is the original keyspace minus all the keys that have been completed minus + /// any keys for which we couldn't find an intersecting layer. It's not tracked explicitly, + /// but if you merge all the keyspaces in the fringe, you get the "current keyspace". + /// + /// This is basically a depth-first search visitor implementation where a vertex + /// is the (layer, lsn range, key space) tuple. The fringe acts as the stack. + /// + /// At each iteration pop the top of the fringe (the layer with the highest Lsn) + /// and get all the required reconstruct data from the layer in one go. + /// + /// Returns the completed keyspace and the keyspaces with image coverage. The caller + /// decides how to deal with these two keyspaces. 
+ async fn get_vectored_reconstruct_data_timeline( + timeline: &Timeline, + keyspace: KeySpace, + mut cont_lsn: Lsn, + reconstruct_state: &mut ValuesReconstructState, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result { + let mut unmapped_keyspace = keyspace.clone(); + let mut fringe = LayerFringe::new(); + + let mut completed_keyspace = KeySpace::default(); + let mut image_covered_keyspace = KeySpaceRandomAccum::new(); + + loop { + if cancel.is_cancelled() { + return Err(GetVectoredError::Cancelled); + } + + let (keys_done_last_step, keys_with_image_coverage) = + reconstruct_state.consume_done_keys(); + unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); + completed_keyspace.merge(&keys_done_last_step); + if let Some(keys_with_image_coverage) = keys_with_image_coverage { + unmapped_keyspace + .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone())); + image_covered_keyspace.add_range(keys_with_image_coverage); + } + + // Do not descent any further if the last layer we visited + // completed all keys in the keyspace it inspected. This is not + // required for correctness, but avoids visiting extra layers + // which turns out to be a perf bottleneck in some cases. 
+ if !unmapped_keyspace.is_empty() { + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); + + let in_memory_layer = layers.find_in_memory_layer(|l| { + let start_lsn = l.get_lsn_range().start; + cont_lsn > start_lsn + }); + + match in_memory_layer { + Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; + fringe.update( + ReadableLayer::InMemoryLayer(l), + unmapped_keyspace.clone(), + lsn_range, + ); + } + None => { + for range in unmapped_keyspace.ranges.iter() { + let results = layers.range_search(range.clone(), cont_lsn); + + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); + } + } + } + + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. 
+ drop(guard); + } + + if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + let next_cont_lsn = lsn_range.start; + layer_to_read + .get_values_reconstruct_data( + keyspace_to_read.clone(), + lsn_range, + reconstruct_state, + ctx, + ) + .await?; + + unmapped_keyspace = keyspace_to_read; + cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(&layer_to_read); + } else { + break; + } + } + + Ok(TimelineVisitOutcome { + completed_keyspace, + image_covered_keyspace: image_covered_keyspace.consume_keyspace(), + }) + } + /// # Cancel-safety /// /// This method is cancellation-safe. @@ -2435,15 +3634,65 @@ impl Timeline { Some((lsn, img)) } - fn get_ancestor_timeline(&self) -> anyhow::Result> { - let ancestor = self.ancestor_timeline.as_ref().with_context(|| { - format!( - "Ancestor is missing. Timeline id: {} Ancestor id {:?}", - self.timeline_id, - self.get_ancestor_timeline_id(), - ) - })?; - Ok(Arc::clone(ancestor)) + async fn get_ready_ancestor_timeline( + &self, + ancestor: &Arc, + ctx: &RequestContext, + ) -> Result, GetReadyAncestorError> { + // It's possible that the ancestor timeline isn't active yet, or + // is active but hasn't yet caught up to the branch point. Wait + // for it. + // + // This cannot happen while the pageserver is running normally, + // because you cannot create a branch from a point that isn't + // present in the pageserver yet. However, we don't wait for the + // branch point to be uploaded to cloud storage before creating + // a branch. I.e., the branch LSN need not be remote consistent + // for the branching operation to succeed. + // + // Hence, if we try to load a tenant in such a state where + // 1. the existence of the branch was persisted (in IndexPart and/or locally) + // 2. but the ancestor state is behind branch_lsn because it was not yet persisted + // then we will need to wait for the ancestor timeline to + // re-stream WAL up to branch_lsn before we access it. 
+ // + // How can a tenant get in such a state? + // - ungraceful pageserver process exit + // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219 + // + // NB: this could be avoided by requiring + // branch_lsn >= remote_consistent_lsn + // during branch creation. + match ancestor.wait_to_become_active(ctx).await { + Ok(()) => {} + Err(TimelineState::Stopping) => { + // If an ancestor is stopping, it means the tenant is stopping: handle this the same as if this timeline was stopping. + return Err(GetReadyAncestorError::Cancelled); + } + Err(state) => { + return Err(GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }); + } + } + ancestor + .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) + .await + .map_err(|e| match e { + e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), + WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled, + WaitLsnError::BadState(state) => GetReadyAncestorError::BadState { + timeline_id: ancestor.timeline_id, + state, + }, + })?; + + Ok(ancestor.clone()) + } + + pub(crate) fn get_ancestor_timeline(&self) -> Option> { + self.ancestor_timeline.clone() } pub(crate) fn get_shard_identity(&self) -> &ShardIdentity { @@ -2453,7 +3702,11 @@ impl Timeline { /// /// Get a handle to the latest layer for appending. 
/// - async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { + async fn get_layer_for_write( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result> { let mut guard = self.layers.write().await; let layer = guard .get_layer_for_write( @@ -2462,48 +3715,12 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, + ctx, ) .await?; Ok(layer) } - async fn put_value( - &self, - key: Key, - lsn: Lsn, - val: &Value, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn).await?; - layer.put_value(key, lsn, val, ctx).await?; - Ok(()) - } - - async fn put_values( - &self, - values: &HashMap>, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Pick the first LSN in the batch to get the layer to write to. - for lsns in values.values() { - if let Some((lsn, _)) = lsns.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_values(values, ctx).await?; - break; - } - } - Ok(()) - } - - async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { - if let Some((_, lsn)) = tombstones.first() { - let layer = self.get_layer_for_write(*lsn).await?; - layer.put_tombstones(tombstones).await?; - } - Ok(()) - } - pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); @@ -2511,43 +3728,44 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - async fn freeze_inmem_layer(&self, write_lock_held: bool) { - // Freeze the current open in-memory layer. It will be written to disk on next - // iteration. 
- let _write_guard = if write_lock_held { - None - } else { - Some(self.write_lock.lock().await) + async fn freeze_inmem_layer_at( + &self, + at: Lsn, + write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + ) { + let frozen = { + let mut guard = self.layers.write().await; + guard + .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) + .await }; - let mut guard = self.layers.write().await; - guard - .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at) - .await; + if frozen { + let now = Instant::now(); + *(self.last_freeze_ts.write().unwrap()) = now; + } } /// Layer flusher task's main loop. async fn flush_loop( self: &Arc, - mut layer_flush_start_rx: tokio::sync::watch::Receiver, + mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { info!("started flush loop"); loop { tokio::select! { _ = self.cancel.cancelled() => { - info!("shutting down layer flush task"); - break; - }, - _ = task_mgr::shutdown_watcher() => { - info!("shutting down layer flush task"); + info!("shutting down layer flush task due to Timeline::cancel"); break; }, _ = layer_flush_start_rx.changed() => {} } - trace!("waking up"); - let timer = self.metrics.flush_time_histo.start_timer(); - let flush_counter = *layer_flush_start_rx.borrow(); + let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow(); + + // The highest LSN to which we flushed in the loop over frozen layers + let mut flushed_to_lsn = Lsn(0); + let result = loop { if self.cancel.is_cancelled() { info!("dropping out of flush loop for timeline shutdown"); @@ -2557,6 +3775,8 @@ impl Timeline { return; } + let timer = self.metrics.flush_time_histo.start_timer(); + let layer_to_flush = { let guard = self.layers.read().await; guard.layer_map().frozen_layers.front().cloned() @@ -2566,31 +3786,64 @@ impl Timeline { break Ok(()); }; match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(()) => {} + Ok(this_layer_to_lsn) => { + flushed_to_lsn = 
std::cmp::max(flushed_to_lsn, this_layer_to_lsn); + } Err(FlushLayerError::Cancelled) => { info!("dropping out of flush loop for timeline shutdown"); return; } err @ Err( - FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_), + FlushLayerError::NotRunning(_) + | FlushLayerError::Other(_) + | FlushLayerError::CreateImageLayersError(_), ) => { error!("could not flush frozen layer: {err:?}"); - break err; + break err.map(|_| ()); } } + timer.stop_and_record(); }; + + // Unsharded tenants should never advance their LSN beyond the end of the + // highest layer they write: such gaps between layer data and the frozen LSN + // are only legal on sharded tenants. + debug_assert!( + self.shard_identity.count.count() > 1 + || flushed_to_lsn >= frozen_to_lsn + || !flushed_to_lsn.is_valid() + ); + + if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 { + // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised + // to us via layer_flush_start_rx, then advance it here. + // + // This path is only taken for tenants with multiple shards: single sharded tenants should + // never encounter a gap in the wal. + let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + if self.set_disk_consistent_lsn(frozen_to_lsn) { + if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { + tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + } + } + } + // Notify any listeners that we're done let _ = self .layer_flush_done_tx .send_replace((flush_counter, result)); - - timer.stop_and_record(); } } - async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> { - let mut rx = self.layer_flush_done_tx.subscribe(); - + /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk. 
+ /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`]. + /// + /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the + /// case, it means no data will be written between the top of the highest frozen layer and + /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data + /// locally for that part of the WAL. + fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result { // Increment the flush cycle counter and wake up the flush task. // Remember the new value, so that when we listen for the flush // to finish, we know when the flush that we initiated has @@ -2599,26 +3852,29 @@ impl Timeline { let flush_loop_state = { *self.flush_loop_state.lock().unwrap() }; if !matches!(flush_loop_state, FlushLoopState::Running { .. }) { - anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") + return Err(FlushLayerError::NotRunning(flush_loop_state)); } - self.layer_flush_start_tx.send_modify(|counter| { + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { my_flush_request = *counter + 1; *counter = my_flush_request; + *lsn = std::cmp::max(at_lsn, *lsn); }); + Ok(my_flush_request) + } + + async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> { + let mut rx = self.layer_flush_done_tx.subscribe(); loop { { let (last_result_counter, last_result) = &*rx.borrow(); - if *last_result_counter >= my_flush_request { - if let Err(_err) = last_result { + if *last_result_counter >= request { + if let Err(err) = last_result { // We already logged the original error in // flush_loop. We cannot propagate it to the caller // here, because it might not be Cloneable - anyhow::bail!( - "Could not flush frozen layer. Request id: {}", - my_flush_request - ); + return Err(err.clone()); } else { return Ok(()); } @@ -2627,7 +3883,7 @@ impl Timeline { trace!("waiting for flush to complete"); tokio::select! 
{ rx_e = rx.changed() => { - rx_e?; + rx_e.map_err(|_| FlushLayerError::NotRunning(*self.flush_loop_state.lock().unwrap()))?; }, // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring // the notification from [`flush_loop`] that it completed. @@ -2640,82 +3896,133 @@ impl Timeline { } } - fn flush_frozen_layers(&self) { - self.layer_flush_start_tx.send_modify(|val| *val += 1); + async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> { + let token = self.flush_frozen_layers(at_lsn)?; + self.wait_flush_completion(token).await } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))] + /// + /// Return value is the last lsn (inclusive) of the layer that was frozen. + #[instrument(skip_all, fields(layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, ctx: &RequestContext, - ) -> Result<(), FlushLayerError> { + ) -> Result { + debug_assert_current_span_has_tenant_and_timeline_id(); + // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the // repository have the same LSN. let lsn_range = frozen_layer.get_lsn_range(); - let (layers_to_upload, delta_layer_to_add) = - if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - initdb_optimization_count, - .. 
- } => { + + // Whether to directly create image layers for this flush, or flush them as delta layers + let create_image_layer = + lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1); + + #[cfg(test)] + { + match &mut *self.flush_loop_state.lock().unwrap() { + FlushLoopState::NotStarted | FlushLoopState::Exited => { + panic!("flush loop not running") + } + FlushLoopState::Running { + expect_initdb_optimization, + initdb_optimization_count, + .. + } => { + if create_image_layer { *initdb_optimization_count += 1; - } - } - // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not - // require downloading anything during initial import. - let (partitioning, _lsn) = self - .repartition( - self.initdb_lsn, - self.get_compaction_target_size(), - EnumSet::empty(), - ctx, - ) - .await?; - - if self.cancel.is_cancelled() { - return Err(FlushLayerError::Cancelled); - } - - // For image layers, we add them immediately into the layer map. - ( - self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) - .await?, - None, - ) - } else { - #[cfg(test)] - match &mut *self.flush_loop_state.lock().unwrap() { - FlushLoopState::NotStarted | FlushLoopState::Exited => { - panic!("flush loop not running") - } - FlushLoopState::Running { - expect_initdb_optimization, - .. - } => { + } else { assert!(!*expect_initdb_optimization, "expected initdb optimization"); } } - // Normal case, write out a L0 delta layer file. - // `create_delta_layer` will not modify the layer map. - // We will remove frozen layer and add delta layer in one atomic operation later. - let layer = self.create_delta_layer(&frozen_layer, ctx).await?; - ( - // FIXME: even though we have a single image and single delta layer assumption - // we push them to vec - vec![layer.clone()], - Some(layer), + } + } + + let (layers_to_upload, delta_layer_to_add) = if create_image_layer { + // Note: The 'ctx' in use here has DownloadBehavior::Error. 
We should not + // require downloading anything during initial import. + let ((rel_partition, metadata_partition), _lsn) = self + .repartition( + self.initdb_lsn, + self.get_compaction_target_size(), + EnumSet::empty(), + ctx, ) + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))?; + + if self.cancel.is_cancelled() { + return Err(FlushLayerError::Cancelled); + } + + // FIXME(auxfilesv2): support multiple metadata key partitions might need initdb support as well? + // This code path will not be hit during regression tests. After #7099 we have a single partition + // with two key ranges. If someone wants to fix initdb optimization in the future, this might need + // to be fixed. + + // For metadata, always create delta layers. + let delta_layer = if !metadata_partition.parts.is_empty() { + assert_eq!( + metadata_partition.parts.len(), + 1, + "currently sparse keyspace should only contain a single metadata keyspace" + ); + let metadata_keyspace = &metadata_partition.parts[0]; + self.create_delta_layer( + &frozen_layer, + Some( + metadata_keyspace.0.ranges.first().unwrap().start + ..metadata_keyspace.0.ranges.last().unwrap().end, + ), + ctx, + ) + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))? + } else { + None }; + // For image layers, we add them immediately into the layer map. + let mut layers_to_upload = Vec::new(); + layers_to_upload.extend( + self.create_image_layers( + &rel_partition, + self.initdb_lsn, + ImageLayerCreationMode::Initial, + ctx, + ) + .await?, + ); + + if let Some(delta_layer) = delta_layer { + layers_to_upload.push(delta_layer.clone()); + (layers_to_upload, Some(delta_layer)) + } else { + (layers_to_upload, None) + } + } else { + // Normal case, write out a L0 delta layer file. + // `create_delta_layer` will not modify the layer map. + // We will remove frozen layer and add delta layer in one atomic operation later. 
+ let Some(layer) = self + .create_delta_layer(&frozen_layer, None, ctx) + .await + .map_err(|e| FlushLayerError::from_anyhow(self, e))? + else { + panic!("delta layer cannot be empty if no filter is applied"); + }; + ( + // FIXME: even though we have a single image and single delta layer assumption + // we push them to vec + vec![layer.clone()], + Some(layer), + ) + }; + pausable_failpoint!("flush-layer-cancel-after-writing-layer-out-pausable"); if self.cancel.is_cancelled() { @@ -2723,12 +4030,11 @@ impl Timeline { } let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); // The new on-disk layers are now in the layer map. We can remove the // in-memory layer from the map now. The flushed layer is stored in // the mapping in `create_delta_layer`. - let metadata = { + { let mut guard = self.layers.write().await; if self.cancel.is_cancelled() { @@ -2737,14 +4043,10 @@ impl Timeline { guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); - self.disk_consistent_lsn.store(disk_consistent_lsn); - + if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn - Some(self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?) - } else { - None + self.schedule_uploads(disk_consistent_lsn, layers_to_upload) + .map_err(|e| FlushLayerError::from_anyhow(self, e))?; } // release lock on 'layers' }; @@ -2759,23 +4061,22 @@ impl Timeline { // This failpoint is used by another test case `test_pageserver_recovery`. fail_point!("flush-frozen-exit"); - // Update the metadata file, with new 'disk_consistent_lsn' - // - // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing - // *all* the layers, to avoid fsyncing the file multiple times. 
+ Ok(Lsn(lsn_range.end.0 - 1)) + } - // If we updated our disk_consistent_lsn, persist the updated metadata to local disk. - if let Some(metadata) = metadata { - save_metadata( - self.conf, - &self.tenant_shard_id, - &self.timeline_id, - &metadata, - ) - .await - .context("save_metadata")?; + /// Return true if the value changed + /// + /// This function must only be used from the layer flush task, and may not be called concurrently. + fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { + // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. + let old_value = self.disk_consistent_lsn.load(); + if new_value != old_value { + assert!(new_value >= old_value); + self.disk_consistent_lsn.store(new_value); + true + } else { + false } - Ok(()) } /// Update metadata file @@ -2783,7 +4084,7 @@ impl Timeline { &self, disk_consistent_lsn: Lsn, layers_to_upload: impl IntoIterator, - ) -> anyhow::Result { + ) -> anyhow::Result<()> { // We can only save a valid 'prev_record_lsn' value on disk if we // flushed *all* in-memory changes to disk. 
We only track // 'prev_record_lsn' in memory for the latest processed record, so we @@ -2800,19 +4101,10 @@ impl Timeline { None }; - let ancestor_timeline_id = self - .ancestor_timeline - .as_ref() - .map(|ancestor| ancestor.timeline_id); - - let metadata = TimelineMetadata::new( + let update = crate::tenant::metadata::MetadataUpdate::new( disk_consistent_lsn, ondisk_prev_record_lsn, - ancestor_timeline_id, - self.ancestor_lsn, *self.latest_gc_cutoff_lsn.read(), - self.initdb_lsn, - self.pg_version, ); fail_point!("checkpoint-before-saving-metadata", |x| bail!( @@ -2820,89 +4112,81 @@ impl Timeline { x.unwrap() )); - if let Some(remote_client) = &self.remote_client { - for layer in layers_to_upload { - remote_client.schedule_layer_file_upload(layer)?; - } - remote_client.schedule_index_upload_for_metadata_update(&metadata)?; + for layer in layers_to_upload { + self.remote_client.schedule_layer_file_upload(layer)?; } - - Ok(metadata) - } - - async fn update_metadata_file( - &self, - disk_consistent_lsn: Lsn, - layers_to_upload: impl IntoIterator, - ) -> anyhow::Result<()> { - let metadata = self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; - - save_metadata( - self.conf, - &self.tenant_shard_id, - &self.timeline_id, - &metadata, - ) - .await - .context("save_metadata")?; + self.remote_client + .schedule_index_upload_for_metadata_update(&update)?; Ok(()) } + pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> { + self.remote_client + .preserve_initdb_archive( + &self.tenant_shard_id.tenant_id, + &self.timeline_id, + &self.cancel, + ) + .await + } + // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked // in layer map immediately. The caller is responsible to put it into the layer map. 
async fn create_delta_layer( self: &Arc, frozen_layer: &Arc, + key_range: Option>, ctx: &RequestContext, - ) -> anyhow::Result { - let span = tracing::info_span!("blocking"); - let new_delta: ResidentLayer = tokio::task::spawn_blocking({ - let self_clone = Arc::clone(self); - let frozen_layer = Arc::clone(frozen_layer); - let ctx = ctx.attached_child(); - move || { - // Write it out - // Keep this inside `spawn_blocking` and `Handle::current` - // as long as the write path is still sync and the read impl - // is still not fully async. Otherwise executor threads would - // be blocked. - let _g = span.entered(); - let new_delta = - Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?; - let new_delta_path = new_delta.local_path().to_owned(); - - // Sync it to disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable. - // - // NB: timeline dir must be synced _after_ the file contents are durable. - // So, two separate fsyncs are required, they mustn't be batched. - // - // TODO: If we're running inside 'flush_frozen_layers' and there are multiple - // files to flush, the fsync overhead can be reduces as follows: - // 1. write them all to temporary file names - // 2. fsync them - // 3. rename to the final name - // 4. fsync the parent directory. - // Note that (1),(2),(3) today happen inside write_to_disk(). - // - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?; - par_fsync::par_fsync(&[self_clone + ) -> anyhow::Result> { + let self_clone = Arc::clone(self); + let frozen_layer = Arc::clone(frozen_layer); + let ctx = ctx.attached_child(); + let work = async move { + let Some(new_delta) = frozen_layer + .write_to_disk(&self_clone, &ctx, key_range) + .await? 
+ else { + return Ok(None); + }; + // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self_clone .conf - .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) - .context("fsync of timeline dir")?; - - anyhow::Ok(new_delta) + .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id), + &ctx, + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + anyhow::Ok(Some(new_delta)) + }; + // Before tokio-epoll-uring, we ran write_to_disk & the sync_all inside spawn_blocking. + // Preserve that behavior to maintain the same behavior for `virtual_file_io_engine=std-fs`. 
+ use crate::virtual_file::io_engine::IoEngine; + match crate::virtual_file::io_engine::get() { + IoEngine::NotSet => panic!("io engine not set"), + IoEngine::StdFs => { + let span = tracing::info_span!("blocking"); + tokio::task::spawn_blocking({ + move || Handle::current().block_on(work.instrument(span)) + }) + .await + .context("spawn_blocking") + .and_then(|x| x) } - }) - .await - .context("spawn_blocking") - .and_then(|x| x)?; - - Ok(new_delta) + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => work.await, + } } async fn repartition( @@ -2911,73 +4195,54 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> anyhow::Result<(KeyPartitioning, Lsn)> { - { - let partitioning_guard = self.partitioning.lock().unwrap(); - let distance = lsn.0 - partitioning_guard.1 .0; - if partitioning_guard.1 != Lsn(0) - && distance <= self.repartition_threshold - && !flags.contains(CompactFlags::ForceRepartition) - { - debug!( - distance, - threshold = self.repartition_threshold, - "no repartitioning needed" - ); - return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); - } + ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> { + let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { + // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. + // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` + // and hence before the compaction task starts. 
+ anyhow::bail!("repartition() called concurrently, this should not happen"); + }; + let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + if lsn < *partition_lsn { + anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); } - let keyspace = self.collect_keyspace(lsn, ctx).await?; - let partitioning = keyspace.partition(partition_size); - let mut partitioning_guard = self.partitioning.lock().unwrap(); - if lsn > partitioning_guard.1 { - *partitioning_guard = (partitioning, lsn); - } else { - warn!("Concurrent repartitioning of keyspace. This unexpected, but probably harmless"); + let distance = lsn.0 - partition_lsn.0; + if *partition_lsn != Lsn(0) + && distance <= self.repartition_threshold + && !flags.contains(CompactFlags::ForceRepartition) + { + debug!( + distance, + threshold = self.repartition_threshold, + "no repartitioning needed" + ); + return Ok(( + (dense_partition.clone(), sparse_partition.clone()), + *partition_lsn, + )); } + + let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); + let sparse_partitioning = SparseKeyPartitioning { + parts: vec![sparse_ks], + }; // no partitioning for metadata keys for now + *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); + Ok((partitioning_guard.0.clone(), partitioning_guard.1)) } // Is it time to create a new image layer for the given partition? 
- async fn time_for_new_image_layer( - &self, - partition: &KeySpace, - lsn: Lsn, - ) -> anyhow::Result { + async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool { let threshold = self.get_image_creation_threshold(); let guard = self.layers.read().await; let layers = guard.layer_map(); let mut max_deltas = 0; - { - let wanted_image_layers = self.wanted_image_layers.lock().unwrap(); - if let Some((cutoff_lsn, wanted)) = &*wanted_image_layers { - let img_range = - partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; - if wanted.overlaps(&img_range) { - // - // gc_timeline only pays attention to image layers that are older than the GC cutoff, - // but create_image_layers creates image layers at last-record-lsn. - // So it's possible that gc_timeline wants a new image layer to be created for a key range, - // but the range is already covered by image layers at more recent LSNs. Before we - // create a new image layer, check if the range is already covered at more recent LSNs. - if !layers - .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))? - { - debug!( - "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})", - img_range.start, img_range.end, cutoff_lsn, lsn - ); - return Ok(true); - } - } - } - } - for part_range in &partition.ranges { - let image_coverage = layers.image_coverage(part_range, lsn)?; + let image_coverage = layers.image_coverage(part_range, lsn); for (img_range, last_img) in image_coverage { let img_lsn = if let Some(last_img) = last_img { last_img.get_lsn_range().end @@ -2998,7 +4263,7 @@ impl Timeline { // after we read last_record_lsn, which is passed here in the 'lsn' argument. 
if img_lsn < lsn { let num_deltas = - layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; + layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold)); max_deltas = max_deltas.max(num_deltas); if num_deltas >= threshold { @@ -3006,7 +4271,7 @@ impl Timeline { "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", img_range.start, img_range.end, num_deltas, img_lsn, lsn ); - return Ok(true); + return true; } } } @@ -3016,62 +4281,52 @@ impl Timeline { max_deltas, "none of the partitioned ranges had >= {threshold} deltas" ); - Ok(false) + false } - #[tracing::instrument(skip_all, fields(%lsn, %force))] - async fn create_image_layers( - self: &Arc, - partitioning: &KeyPartitioning, + /// Create image layers for Postgres data. Assumes the caller passes a partition that is not too large, + /// so that at most one image layer will be produced from this function. + async fn create_image_layer_for_rel_blocks( + self: &Arc, + partition: &KeySpace, + mut image_layer_writer: ImageLayerWriter, lsn: Lsn, - force: bool, ctx: &RequestContext, - ) -> Result, PageReconstructError> { - let timer = self.metrics.create_images_time_histo.start_timer(); - let mut image_layers = Vec::new(); + img_range: Range, + start: Key, + ) -> Result { + let mut wrote_keys = false; - // We need to avoid holes between generated image layers. - // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one - // image layer with hole between them. In this case such layer can not be utilized by GC. - // - // How such hole between partitions can appear? - // if we have relation with relid=1 and size 100 and relation with relid=2 with size 200 then result of - // KeySpace::partition may contain partitions <100000000..100000099> and <200000000..200000199>. 
- // If there is delta layer <100000000..300000000> then it never be garbage collected because - // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. - let mut start = Key::MIN; + let mut key_request_accum = KeySpaceAccum::new(); + for range in &partition.ranges { + let mut key = range.start; + while key < range.end { + // Decide whether to retain this key: usually we do, but sharded tenants may + // need to drop keys that don't belong to them. If we retain the key, add it + // to `key_request_accum` for later issuing a vectored get + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } else { + key_request_accum.add_key(key); + } - for partition in partitioning.parts.iter() { - let img_range = start..partition.ranges.last().unwrap().end; - start = img_range.end; - if force || self.time_for_new_image_layer(partition, lsn).await? 
{ - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - &img_range, - lsn, - ) - .await?; + let last_key_in_range = key.next() == range.end; + key = key.next(); - fail_point!("image-layer-writer-fail-before-finish", |_| { - Err(PageReconstructError::Other(anyhow::anyhow!( - "failpoint image-layer-writer-fail-before-finish" - ))) - }); - for range in &partition.ranges { - let mut key = range.start; - while key < range.end { - if self.shard_identity.is_key_disposable(&key) { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - key = key.next(); - continue; - } - let img = match self.get(key, lsn, ctx).await { + // Maybe flush `key_rest_accum` + if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS + || (last_key_in_range && key_request_accum.raw_size() > 0) + { + let results = self + .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx) + .await?; + + for (img_key, img) in results { + let img = match img { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -3089,53 +4344,272 @@ impl Timeline { // Unfortunately we cannot do this for the main fork, or for // any metadata keys, keys, as that would lead to actual data // loss. - if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { - warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); + if img_key.is_rel_fsm_block_key() || img_key.is_rel_vm_block_key() { + warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}"); ZERO_PAGE.clone() } else { - return Err(err); + return Err(CreateImageLayersError::PageReconstructError(err)); } } }; - image_layer_writer.put_image(key, &img).await?; - key = key.next(); + // Write all the keys we just read into our new image layer. 
+ image_layer_writer.put_image(img_key, img, ctx).await?; + wrote_keys = true; } } - let image_layer = image_layer_writer.finish(self).await?; - image_layers.push(image_layer); } } - // All layers that the GC wanted us to create have now been created. - // - // It's possible that another GC cycle happened while we were compacting, and added - // something new to wanted_image_layers, and we now clear that before processing it. - // That's OK, because the next GC iteration will put it back in. - *self.wanted_image_layers.lock().unwrap() = None; - // Sync the new layer to disk before adding it to the layer map, to make sure - // we don't garbage collect something based on the new layer, before it has - // reached the disk. - // - // We must also fsync the timeline dir to ensure the directory entries for - // new layer files are durable - // - // Compaction creates multiple image layers. It would be better to create them all - // and fsync them all in parallel. - let all_paths = image_layers - .iter() - .map(|layer| layer.local_path().to_owned()) - .collect::>(); + if wrote_keys { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. + let image_layer = image_layer_writer.finish(self, ctx).await?; + Ok(ImageLayerCreationOutcome { + image: Some(image_layer), + next_start_key: img_range.end, + }) + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: start, + }) + } + } - par_fsync::par_fsync_async(&all_paths) + /// Create an image layer for metadata keys. 
This function produces one image layer for all metadata + /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it + /// would not be too large to fit in a single image layer. + #[allow(clippy::too_many_arguments)] + async fn create_image_layer_for_metadata_keys( + self: &Arc, + partition: &KeySpace, + mut image_layer_writer: ImageLayerWriter, + lsn: Lsn, + ctx: &RequestContext, + img_range: Range, + mode: ImageLayerCreationMode, + start: Key, + ) -> Result { + assert!(!matches!(mode, ImageLayerCreationMode::Initial)); + + // Metadata keys image layer creation. + let mut reconstruct_state = ValuesReconstructState::default(); + let data = self + .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) + .await?; + let (data, total_kb_retrieved, total_keys_retrieved) = { + let mut new_data = BTreeMap::new(); + let mut total_kb_retrieved = 0; + let mut total_keys_retrieved = 0; + for (k, v) in data { + let v = v.map_err(CreateImageLayersError::PageReconstructError)?; + total_kb_retrieved += KEY_SIZE + v.len(); + total_keys_retrieved += 1; + new_data.insert(k, v); + } + (new_data, total_kb_retrieved / 1024, total_keys_retrieved) + }; + let delta_files_accessed = reconstruct_state.get_delta_layers_visited(); + + let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; + debug!( + trigger_generation, + delta_files_accessed, + total_kb_retrieved, + total_keys_retrieved, + "generate metadata images" + ); + + if !trigger_generation && mode == ImageLayerCreationMode::Try { + return Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: img_range.end, + }); + } + let mut wrote_any_image = false; + for (k, v) in data { + if v.is_empty() { + // the key has been deleted, it does not need an image + // in metadata keyspace, an empty image == tombstone + continue; + } + wrote_any_image = true; + + // No need to handle sharding b/c metadata keys are always on the 0-th shard. 
+ + // TODO: split image layers to avoid too large layer files. Too large image files are not handled + // on the normal data path either. + image_layer_writer.put_image(k, v, ctx).await?; + } + + if wrote_any_image { + // Normal path: we have written some data into the new image layer for this + // partition, so flush it to disk. + let image_layer = image_layer_writer.finish(self, ctx).await?; + Ok(ImageLayerCreationOutcome { + image: Some(image_layer), + next_start_key: img_range.end, + }) + } else { + // Special case: the image layer may be empty if this is a sharded tenant and the + // partition does not cover any keys owned by this shard. In this case, to ensure + // we don't leave gaps between image layers, leave `start` where it is, so that the next + // layer we write will cover the key range that we just scanned. + tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + Ok(ImageLayerCreationOutcome { + image: None, + next_start_key: start, + }) + } + } + + #[tracing::instrument(skip_all, fields(%lsn, %mode))] + async fn create_image_layers( + self: &Arc, + partitioning: &KeyPartitioning, + lsn: Lsn, + mode: ImageLayerCreationMode, + ctx: &RequestContext, + ) -> Result, CreateImageLayersError> { + let timer = self.metrics.create_images_time_histo.start_timer(); + let mut image_layers = Vec::new(); + + // We need to avoid holes between generated image layers. + // Otherwise LayerMap::image_layer_exists will return false if key range of some layer is covered by more than one + // image layer with hole between them. In this case such layer can not be utilized by GC. + // + // How such hole between partitions can appear? + // if we have relation with relid=1 and size 100 and relation with relid=2 with size 200 then result of + // KeySpace::partition may contain partitions <100000000..100000099> and <200000000..200000199>. 
+ // If there is delta layer <100000000..300000000> then it never be garbage collected because + // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. + let mut start = Key::MIN; + + let check_for_image_layers = { + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = self.get_image_layer_creation_check_threshold() as u64 + * self.get_checkpoint_distance(); + + // Skip the expensive delta layer counting if this timeline has not ingested sufficient + // WAL since the last check. + distance.0 >= min_distance + }; + + if check_for_image_layers { + self.last_image_layer_creation_check_at.store(lsn); + } + + for partition in partitioning.parts.iter() { + let img_range = start..partition.ranges.last().unwrap().end; + let compact_metadata = partition.overlaps(&Key::metadata_key_range()); + if compact_metadata { + for range in &partition.ranges { + assert!( + range.start.field1 >= METADATA_KEY_BEGIN_PREFIX + && range.end.field1 <= METADATA_KEY_END_PREFIX, + "metadata keys must be partitioned separately" + ); + } + if mode == ImageLayerCreationMode::Initial { + return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers"))); + } + if mode == ImageLayerCreationMode::Try && !check_for_image_layers { + // Skip compaction if there are not enough updates. Metadata compaction will do a scan and + // might mess up with evictions. 
+ start = img_range.end; + continue; + } + } else if let ImageLayerCreationMode::Try = mode { + // check_for_image_layers = false -> skip + // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate + if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await { + start = img_range.end; + continue; + } + } + + let image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &img_range, + lsn, + ctx, + ) + .await?; + + fail_point!("image-layer-writer-fail-before-finish", |_| { + Err(CreateImageLayersError::Other(anyhow::anyhow!( + "failpoint image-layer-writer-fail-before-finish" + ))) + }); + + if !compact_metadata { + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_rel_blocks( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + start, + ) + .await?; + + start = next_start_key; + image_layers.extend(image); + } else { + let ImageLayerCreationOutcome { + image, + next_start_key, + } = self + .create_image_layer_for_metadata_keys( + partition, + image_layer_writer, + lsn, + ctx, + img_range, + mode, + start, + ) + .await?; + start = next_start_key; + image_layers.extend(image); + } + } + + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + if !image_layers.is_empty() { + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. 
+ let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ctx, + ) .await - .context("fsync of newly created layer files")?; - - par_fsync::par_fsync_async(&[self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) - .await - .context("fsync of timeline dir")?; + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } let mut guard = self.layers.write().await; @@ -3152,6 +4626,16 @@ impl Timeline { /// this Timeline is shut down. Calling this function will cause the initial /// logical size calculation to skip waiting for the background jobs barrier. pub(crate) async fn await_initial_logical_size(self: Arc) { + if !self.shard_identity.is_shard_zero() { + // We don't populate logical size on shard >0: skip waiting for it. + return; + } + + if self.remote_client.is_deleting() { + // The timeline was created in a deletion-resume state, we don't expect logical size to be populated + return; + } + if let Some(await_bg_cancel) = self .current_logical_size .cancel_wait_for_background_loop_concurrency_limit_semaphore @@ -3163,9 +4647,10 @@ impl Timeline { // the logical size cancellation to skip the concurrency limit semaphore. // TODO: this is an unexpected case. We should restructure so that it // can't happen. - tracing::info!( + tracing::warn!( "await_initial_logical_size: can't get semaphore cancel token, skipping" ); + debug_assert!(false); } tokio::select!( @@ -3173,12 +4658,56 @@ impl Timeline { _ = self.cancel.cancelled() => {} ) } -} -#[derive(Default)] -struct CompactLevel0Phase1Result { - new_layers: Vec, - deltas_to_compact: Vec, + /// Detach this timeline from its ancestor by copying all of the ancestor's layers as this + /// Timeline's layers up to the ancestor_lsn. 
+ /// + /// Requires a timeline that: + /// - has an ancestor to detach from + /// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not + /// a technical requirement + /// + /// After the operation has been started, it cannot be canceled. Upon restart it needs to be + /// polled again until completion. + /// + /// During the operation all timelines sharing the data with this timeline will be reparented + /// from our ancestor to be branches of this timeline. + pub(crate) async fn prepare_to_detach_from_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + options: detach_ancestor::Options, + ctx: &RequestContext, + ) -> Result< + ( + completion::Completion, + detach_ancestor::PreparedTimelineDetach, + ), + detach_ancestor::Error, + > { + detach_ancestor::prepare(self, tenant, options, ctx).await + } + + /// Completes the ancestor detach. This method is to be called while holding the + /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any + /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// + /// Pageserver receiving a SIGKILL during this operation is not supported (yet). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + prepared: detach_ancestor::PreparedTimelineDetach, + ctx: &RequestContext, + ) -> Result, anyhow::Error> { + detach_ancestor::complete(self, tenant, prepared, ctx).await + } + + /// Switch aux file policy and schedule upload to the index part. + pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> { + self.last_aux_file_policy.store(Some(policy)); + self.remote_client + .schedule_index_upload_for_aux_file_policy_update(Some(policy))?; + Ok(()) + } } /// Top-level failure to compact. 
@@ -3191,6 +4720,18 @@ pub(crate) enum CompactionError { Other(#[from] anyhow::Error), } +impl From for CompactionError { + fn from(err: CollectKeySpaceError) -> Self { + match err { + CollectKeySpaceError::Cancelled + | CollectKeySpaceError::PageRead(PageReconstructError::Cancelled) => { + CompactionError::ShuttingDown + } + e => CompactionError::Other(e.into()), + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); @@ -3203,7 +4744,7 @@ enum DurationRecorder { } impl DurationRecorder { - pub fn till_now(&self) -> DurationRecorder { + fn till_now(&self) -> DurationRecorder { match self { DurationRecorder::NotStarted => { panic!("must only call on recorded measurements") @@ -3214,7 +4755,7 @@ impl DurationRecorder { } } } - pub fn into_recorded(self) -> Option { + fn into_recorded(self) -> Option { match self { DurationRecorder::NotStarted => None, DurationRecorder::Recorded(recorded, _) => Some(recorded), @@ -3222,580 +4763,20 @@ impl DurationRecorder { } } -#[derive(Default)] -struct CompactLevel0Phase1StatsBuilder { - version: Option, - tenant_id: Option, - timeline_id: Option, - read_lock_acquisition_micros: DurationRecorder, - read_lock_held_spawn_blocking_startup_micros: DurationRecorder, - read_lock_held_key_sort_micros: DurationRecorder, - read_lock_held_prerequisites_micros: DurationRecorder, - read_lock_held_compute_holes_micros: DurationRecorder, - read_lock_drop_micros: DurationRecorder, - write_layer_files_micros: DurationRecorder, - level0_deltas_count: Option, - new_deltas_count: Option, - new_deltas_size: Option, -} - -#[derive(serde::Serialize)] -struct CompactLevel0Phase1Stats { - version: u64, - tenant_id: TenantShardId, - timeline_id: TimelineId, - read_lock_acquisition_micros: RecordedDuration, - read_lock_held_spawn_blocking_startup_micros: RecordedDuration, - read_lock_held_key_sort_micros: RecordedDuration, - read_lock_held_prerequisites_micros: 
RecordedDuration, - read_lock_held_compute_holes_micros: RecordedDuration, - read_lock_drop_micros: RecordedDuration, - write_layer_files_micros: RecordedDuration, - level0_deltas_count: usize, - new_deltas_count: usize, - new_deltas_size: u64, -} - -impl TryFrom for CompactLevel0Phase1Stats { - type Error = anyhow::Error; - - fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { - Ok(Self { - version: value.version.ok_or_else(|| anyhow!("version not set"))?, - tenant_id: value - .tenant_id - .ok_or_else(|| anyhow!("tenant_id not set"))?, - timeline_id: value - .timeline_id - .ok_or_else(|| anyhow!("timeline_id not set"))?, - read_lock_acquisition_micros: value - .read_lock_acquisition_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, - read_lock_held_spawn_blocking_startup_micros: value - .read_lock_held_spawn_blocking_startup_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, - read_lock_held_key_sort_micros: value - .read_lock_held_key_sort_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, - read_lock_held_prerequisites_micros: value - .read_lock_held_prerequisites_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, - read_lock_held_compute_holes_micros: value - .read_lock_held_compute_holes_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, - read_lock_drop_micros: value - .read_lock_drop_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, - write_layer_files_micros: value - .write_layer_files_micros - .into_recorded() - .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, - level0_deltas_count: value - .level0_deltas_count - .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, - new_deltas_count: value - .new_deltas_count - .ok_or_else(|| 
anyhow!("new_deltas_count not set"))?, - new_deltas_size: value - .new_deltas_size - .ok_or_else(|| anyhow!("new_deltas_size not set"))?, - }) - } -} - impl Timeline { - /// Level0 files first phase of compaction, explained in the [`Self::compact`] comment. - async fn compact_level0_phase1( + async fn finish_compact_batch( self: &Arc, - guard: tokio::sync::OwnedRwLockReadGuard, - mut stats: CompactLevel0Phase1StatsBuilder, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result { - stats.read_lock_held_spawn_blocking_startup_micros = - stats.read_lock_acquisition_micros.till_now(); // set by caller - let layers = guard.layer_map(); - let level0_deltas = layers.get_level0_deltas()?; - let mut level0_deltas = level0_deltas - .into_iter() - .map(|x| guard.get_from_desc(&x)) - .collect_vec(); - stats.level0_deltas_count = Some(level0_deltas.len()); - // Only compact if enough layers have accumulated. - let threshold = self.get_compaction_threshold(); - if level0_deltas.is_empty() || level0_deltas.len() < threshold { - debug!( - level0_deltas = level0_deltas.len(), - threshold, "too few deltas to compact" - ); - return Ok(CompactLevel0Phase1Result::default()); - } - - // This failpoint is used together with `test_duplicate_layers` integration test. - // It returns the compaction result exactly the same layers as input to compaction. - // We want to ensure that this will not cause any problem when updating the layer map - // after the compaction is finished. - // - // Currently, there are two rare edge cases that will cause duplicated layers being - // inserted. - // 1. The compaction job is inturrupted / did not finish successfully. Assume we have file 1, 2, 3, 4, which - // is compacted to 5, but the page server is shut down, next time we start page server we will get a layer - // map containing 1, 2, 3, 4, and 5, whereas 5 has the same content as 4. 
If we trigger L0 compation at this - // point again, it is likely that we will get a file 6 which has the same content and the key range as 5, - // and this causes an overwrite. This is acceptable because the content is the same, and we should do a - // layer replace instead of the normal remove / upload process. - // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and is of target file - // size length. Compaction will likely create the same set of n files afterwards. - // - // This failpoint is a superset of both of the cases. - if cfg!(feature = "testing") { - let active = (|| { - ::fail::fail_point!("compact-level0-phase1-return-same", |_| true); - false - })(); - - if active { - let mut new_layers = Vec::with_capacity(level0_deltas.len()); - for delta in &level0_deltas { - // we are just faking these layers as being produced again for this failpoint - new_layers.push( - delta - .download_and_keep_resident() - .await - .context("download layer for failpoint")?, - ); - } - tracing::info!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint - return Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: level0_deltas, - }); - } - } - - // Gather the files to compact in this iteration. - // - // Start with the oldest Level 0 delta file, and collect any other - // level 0 files that form a contiguous sequence, such that the end - // LSN of previous file matches the start LSN of the next file. - // - // Note that if the files don't form such a sequence, we might - // "compact" just a single file. That's a bit pointless, but it allows - // us to get rid of the level 0 file, and compact the other files on - // the next iteration. This could probably made smarter, but such - // "gaps" in the sequence of level 0 files should only happen in case - // of a crash, partial download from cloud storage, or something like - // that, so it's not a big deal in practice. 
- level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); - let mut level0_deltas_iter = level0_deltas.iter(); - - let first_level0_delta = level0_deltas_iter.next().unwrap(); - let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; - let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); - - deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); - for l in level0_deltas_iter { - let lsn_range = &l.layer_desc().lsn_range; - - if lsn_range.start != prev_lsn_end { - break; - } - deltas_to_compact.push(l.download_and_keep_resident().await?); - prev_lsn_end = lsn_range.end; - } - let lsn_range = Range { - start: deltas_to_compact - .first() - .unwrap() - .layer_desc() - .lsn_range - .start, - end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, - }; - - info!( - "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", - lsn_range.start, - lsn_range.end, - deltas_to_compact.len(), - level0_deltas.len() - ); - - for l in deltas_to_compact.iter() { - info!("compact includes {l}"); - } - - // We don't need the original list of layers anymore. Drop it so that - // we don't accidentally use it later in the function. - drop(level0_deltas); - - stats.read_lock_held_prerequisites_micros = stats - .read_lock_held_spawn_blocking_startup_micros - .till_now(); - - // Determine N largest holes where N is number of compacted layers. - let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); - let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; - let min_hole_coverage_size = 3; // TODO: something more flexible? 
- - // min-heap (reserve space for one more element added before eviction) - let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); - let mut prev: Option = None; - - let mut all_keys = Vec::new(); - - for l in deltas_to_compact.iter() { - all_keys.extend(l.load_keys(ctx).await?); - } - - // FIXME: should spawn_blocking the rest of this function - - // The current stdlib sorting implementation is designed in a way where it is - // particularly fast where the slice is made up of sorted sub-ranges. - all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); - - stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); - - for &DeltaEntry { key: next_key, .. } in all_keys.iter() { - if let Some(prev_key) = prev { - // just first fast filter - if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { - let key_range = prev_key..next_key; - // Measuring hole by just subtraction of i128 representation of key range boundaries - // has not so much sense, because largest holes will corresponds field1/field2 changes. - // But we are mostly interested to eliminate holes which cause generation of excessive image layers. - // That is why it is better to measure size of hole as number of covering image layers. 
- let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len(); - if coverage_size >= min_hole_coverage_size { - heap.push(Hole { - key_range, - coverage_size, - }); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole - } - } - } - } - prev = Some(next_key.next()); - } - stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_rlock(guard); - stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); - let mut holes = heap.into_vec(); - holes.sort_unstable_by_key(|hole| hole.key_range.start); - let mut next_hole = 0; // index of next hole in holes vector - - // This iterator walks through all key-value pairs from all the layers - // we're compacting, in key, LSN order. - let all_values_iter = all_keys.iter(); - - // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = all_keys - .iter() - .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) - .coalesce(|mut prev, cur| { - // Coalesce keys that belong to the same key pair. - // This ensures that compaction doesn't put them - // into different layer files. - // Still limit this by the target file size, - // so that we keep the size of the files in - // check. - if prev.0 == cur.0 && prev.2 < target_file_size { - prev.2 += cur.2; - Ok(prev) - } else { - Err((prev, cur)) - } - }); - - // Merge the contents of all the input delta layers into a new set - // of delta layers, based on the current partitioning. - // - // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. - // It's possible that there is a single key with so many page versions that storing all of them in a single layer file - // would be too large. 
In that case, we also split on the LSN dimension. - // - // LSN - // ^ - // | - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | | | | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // - // - // If one key (X) has a lot of page versions: - // - // LSN - // ^ - // | (X) - // | +-----------+ +--+--+--+--+ - // | | | | | | | | - // | +-----------+ | | +--+ | - // | | | | | | | | - // | +-----------+ ==> | | | | | - // | | | | | +--+ | - // | +-----------+ | | | | | - // | | | | | | | | - // | +-----------+ +--+--+--+--+ - // | - // +--------------> key - // TODO: this actually divides the layers into fixed-size chunks, not - // based on the partitioning. - // - // TODO: we should also opportunistically materialize and - // garbage collect what we can. - let mut new_layers = Vec::new(); - let mut prev_key: Option = None; - let mut writer: Option = None; - let mut key_values_total_size = 0u64; - let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key - let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - - for &DeltaEntry { - key, lsn, ref val, .. - } in all_values_iter - { - let value = val.load(ctx).await?; - let same_key = prev_key.map_or(false, |prev_key| prev_key == key); - // We need to check key boundaries once we reach next key or end of layer with the same key - if !same_key || lsn == dup_end_lsn { - let mut next_key_size = 0u64; - let is_dup_layer = dup_end_lsn.is_valid(); - dup_start_lsn = Lsn::INVALID; - if !same_key { - dup_end_lsn = Lsn::INVALID; - } - // Determine size occupied by this key. 
We stop at next key or when size becomes larger than target_file_size - for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { - next_key_size = next_size; - if key != next_key { - if dup_end_lsn.is_valid() { - // We are writting segment with duplicates: - // place all remaining values of this key in separate segment - dup_start_lsn = dup_end_lsn; // new segments starts where old stops - dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range - } - break; - } - key_values_total_size += next_size; - // Check if it is time to split segment: if total keys size is larger than target file size. - // We need to avoid generation of empty segments if next_size > target_file_size. - if key_values_total_size > target_file_size && lsn != next_lsn { - // Split key between multiple layers: such layer can contain only single key - dup_start_lsn = if dup_end_lsn.is_valid() { - dup_end_lsn // new segment with duplicates starts where old one stops - } else { - lsn // start with the first LSN for this key - }; - dup_end_lsn = next_lsn; // upper LSN boundary is exclusive - break; - } - } - // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. - if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { - dup_start_lsn = dup_end_lsn; - dup_end_lsn = lsn_range.end; - } - if writer.is_some() { - let written_size = writer.as_mut().unwrap().size(); - let contains_hole = - next_hole < holes.len() && key >= holes[next_hole].key_range.end; - // check if key cause layer overflow or contains hole... - if is_dup_layer - || dup_end_lsn.is_valid() - || written_size + key_values_total_size > target_file_size - || contains_hole - { - // ... 
if so, flush previous layer and prepare to write new one - new_layers.push( - writer - .take() - .unwrap() - .finish(prev_key.unwrap().next(), self) - .await?, - ); - writer = None; - - if contains_hole { - // skip hole - next_hole += 1; - } - } - } - // Remember size of key value because at next iteration we will access next item - key_values_total_size = next_key_size; - } - if writer.is_none() { - // Create writer if not initiaized yet - writer = Some( - DeltaLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - key, - if dup_end_lsn.is_valid() { - // this is a layer containing slice of values of the same key - debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); - dup_start_lsn..dup_end_lsn - } else { - debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); - lsn_range.clone() - }, - ) - .await?, - ); - } - - fail_point!("delta-layer-writer-fail-before-finish", |_| { - Err(CompactionError::Other(anyhow::anyhow!( - "failpoint delta-layer-writer-fail-before-finish" - ))) - }); - - if !self.shard_identity.is_key_disposable(&key) { - writer.as_mut().unwrap().put_value(key, lsn, value).await?; - } else { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); - } - - if !new_layers.is_empty() { - fail_point!("after-timeline-compacted-first-L1"); - } - - prev_key = Some(key); - } - if let Some(writer) = writer { - new_layers.push(writer.finish(prev_key.unwrap().next(), self).await?); - } - - // Sync layers - if !new_layers.is_empty() { - // Print a warning if the created layer is larger than double the target size - // Add two pages for potential overhead. This should in theory be already - // accounted for in the target calculation, but for very small targets, - // we still might easily hit the limit otherwise. 
- let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; - for layer in new_layers.iter() { - if layer.layer_desc().file_size > warn_limit { - warn!( - %layer, - "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size - ); - } - } - - // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here - let layer_paths: Vec = new_layers - .iter() - .map(|l| l.local_path().to_owned()) - .collect(); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. - par_fsync::par_fsync_async(&layer_paths) - .await - .context("fsync all new layers")?; - - let timeline_dir = self - .conf - .timeline_path(&self.tenant_shard_id, &self.timeline_id); - - par_fsync::par_fsync_async(&[timeline_dir]) - .await - .context("fsync of timeline dir")?; - } - - stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); - stats.new_deltas_count = Some(new_layers.len()); - stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); - - match TryInto::::try_into(stats) - .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) - { - Ok(stats_json) => { - info!( - stats_json = stats_json.as_str(), - "compact_level0_phase1 stats available" - ) - } - Err(e) => { - warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); - } - } - - Ok(CompactLevel0Phase1Result { - new_layers, - deltas_to_compact: deltas_to_compact - .into_iter() - .map(|x| x.drop_eviction_guard()) - .collect::>(), - }) - } - - /// - /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as - /// as Level 1 files. 
- /// - async fn compact_level0( - self: &Arc, - target_file_size: u64, - ctx: &RequestContext, - ) -> Result<(), CompactionError> { - let CompactLevel0Phase1Result { - new_layers, - deltas_to_compact, - } = { - let phase1_span = info_span!("compact_level0_phase1"); - let ctx = ctx.attached_child(); - let mut stats = CompactLevel0Phase1StatsBuilder { - version: Some(2), - tenant_id: Some(self.tenant_shard_id), - timeline_id: Some(self.timeline_id), - ..Default::default() - }; - - let begin = tokio::time::Instant::now(); - let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; - let now = tokio::time::Instant::now(); - stats.read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); - self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) - .instrument(phase1_span) - .await? - }; - - if new_layers.is_empty() && deltas_to_compact.is_empty() { - // nothing to do - return Ok(()); - } - + new_deltas: &[ResidentLayer], + new_images: &[ResidentLayer], + layers_to_remove: &[Layer], + ) -> anyhow::Result<()> { let mut guard = self.layers.write().await; let mut duplicated_layers = HashSet::new(); - let mut insert_layers = Vec::with_capacity(new_layers.len()); + let mut insert_layers = Vec::with_capacity(new_deltas.len()); - for l in &new_layers { + for l in new_deltas { if guard.contains(l.as_ref()) { // expected in tests tracing::error!(layer=%l, "duplicated L1 layer"); @@ -3806,32 +4787,73 @@ impl Timeline { // because we have not implemented L0 => L0 compaction. 
duplicated_layers.insert(l.layer_desc().key()); } else if LayerMap::is_l0(l.layer_desc()) { - return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction."))); + bail!("compaction generates a L0 layer file as output, which will cause infinite compaction."); } else { insert_layers.push(l.clone()); } } - let remove_layers = { - let mut deltas_to_compact = deltas_to_compact; - // only remove those inputs which were not outputs - deltas_to_compact.retain(|l| !duplicated_layers.contains(&l.layer_desc().key())); - deltas_to_compact - }; + // only remove those inputs which were not outputs + let remove_layers: Vec = layers_to_remove + .iter() + .filter(|l| !duplicated_layers.contains(&l.layer_desc().key())) + .cloned() + .collect(); + + if !new_images.is_empty() { + guard.track_new_image_layers(new_images, &self.metrics); + } // deletion will happen later, the layer file manager calls garbage_collect_on_drop guard.finish_compact_l0(&remove_layers, &insert_layers, &self.metrics); - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_compaction_update(&remove_layers, &new_layers)?; - } + self.remote_client + .schedule_compaction_update(&remove_layers, new_deltas)?; drop_wlock(guard); Ok(()) } - /// Update information about which layer files need to be retained on + async fn rewrite_layers( + self: &Arc, + mut replace_layers: Vec<(Layer, ResidentLayer)>, + mut drop_layers: Vec, + ) -> anyhow::Result<()> { + let mut guard = self.layers.write().await; + + // Trim our lists in case our caller (compaction) raced with someone else (GC) removing layers: we want + // to avoid double-removing, and avoid rewriting something that was removed. 
+ replace_layers.retain(|(l, _)| guard.contains(l)); + drop_layers.retain(|l| guard.contains(l)); + + guard.rewrite_layers(&replace_layers, &drop_layers, &self.metrics); + + let upload_layers: Vec<_> = replace_layers.into_iter().map(|r| r.1).collect(); + + self.remote_client + .schedule_compaction_update(&drop_layers, &upload_layers)?; + + Ok(()) + } + + /// Schedules the uploads of the given image layers + fn upload_new_image_layers( + self: &Arc, + new_images: impl IntoIterator, + ) -> anyhow::Result<()> { + for layer in new_images { + self.remote_client.schedule_layer_file_upload(layer)?; + } + // should any new image layer have been created, not uploading index_part will + // result in a mismatch between remote_physical_size and layermap calculated + // size, which will fail some tests, but should not be an issue otherwise. + self.remote_client + .schedule_index_upload_for_file_changes()?; + Ok(()) + } + + /// Find the Lsns above which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete /// page versions more aggressively. @@ -3839,17 +4861,6 @@ impl Timeline { /// TODO: that's wishful thinking, compaction doesn't actually do that /// currently. /// - /// The caller specifies how much history is needed with the 3 arguments: - /// - /// retain_lsns: keep a version of each page at these LSNs - /// cutoff_horizon: also keep everything newer than this LSN - /// pitr: the time duration required to keep data for PITR - /// - /// The 'retain_lsns' list is currently used to prevent removing files that - /// are needed by child timelines. In the future, the user might be able to - /// name additional points in time to retain. The caller is responsible for - /// collecting that information. - /// /// The 'cutoff_horizon' point is used to retain recent versions that might still be /// needed by read-only nodes. 
(As of this writing, the caller just passes /// the latest LSN subtracted by a constant, and doesn't do anything smart @@ -3857,23 +4868,22 @@ impl Timeline { /// /// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine /// whether a record is needed for PITR. - /// - /// NOTE: This function holds a short-lived lock to protect the 'gc_info' - /// field, so that the three values passed as argument are stored - /// atomically. But the caller is responsible for ensuring that no new - /// branches are created that would need to be included in 'retain_lsns', - /// for example. The caller should hold `Tenant::gc_cs` lock to ensure - /// that. - /// #[instrument(skip_all, fields(timeline_id=%self.timeline_id))] - pub(super) async fn update_gc_info( + pub(super) async fn find_gc_cutoffs( &self, - retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result { + let _timer = self + .metrics + .find_gc_cutoffs_histo + .start_timer() + .record_on_drop(); + + pausable_failpoint!("Timeline::find_gc_cutoffs-pausable"); + // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // // Some unit tests depend on garbage-collection working even when @@ -3894,8 +4904,11 @@ impl Timeline { // The timestamp is in the future. That sounds impossible, // but what it really means is that there hasn't been // any commits since the cutoff timestamp. + // + // In this case we should use the LSN of the most recent commit, + // which is implicitly the last LSN in the log. debug!("future({})", lsn); - cutoff_horizon + self.get_last_record_lsn() } LsnForTimestamp::Past(lsn) => { debug!("past({})", lsn); @@ -3916,19 +4929,14 @@ impl Timeline { *self.get_latest_gc_cutoff_lsn() } } else { - // No time-based retention was configured. Set time-based cutoff to - // same as LSN based. - cutoff_horizon + // No time-based retention was configured. 
Interpret this as "keep no history". + self.get_last_record_lsn() }; - // Grab the lock and update the values - *self.gc_info.write().unwrap() = GcInfo { - retain_lsns, - horizon_cutoff: cutoff_horizon, - pitr_cutoff, - }; - - Ok(()) + Ok(GcCutoffs { + horizon: cutoff_horizon, + pitr: pitr_cutoff, + }) } /// Garbage collect layer files on a timeline that are no longer needed. @@ -3936,14 +4944,12 @@ impl Timeline { /// Currently, we don't make any attempt at removing unneeded page versions /// within a layer file. We can only remove the whole file if it's fully /// obsolete. - pub(super) async fn gc(&self) -> anyhow::Result { + pub(super) async fn gc(&self) -> Result { // this is most likely the background tasks, but it might be the spawned task from // immediate_gc - let cancel = crate::task_mgr::shutdown_token(); let _g = tokio::select! { guard = self.gc_lock.lock() => guard, _ = self.cancel.cancelled() => return Ok(GcResult::default()), - _ = cancel.cancelled() => return Ok(GcResult::default()), }; let timer = self.metrics.garbage_collect_histo.start_timer(); @@ -3951,22 +4957,65 @@ impl Timeline { // Is the timeline being deleted? if self.is_stopping() { - anyhow::bail!("timeline is Stopping"); + return Err(GcError::TimelineCancelled); } - let (horizon_cutoff, pitr_cutoff, retain_lsns) = { + let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = { let gc_info = self.gc_info.read().unwrap(); - let horizon_cutoff = min(gc_info.horizon_cutoff, self.get_disk_consistent_lsn()); - let pitr_cutoff = gc_info.pitr_cutoff; + let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn()); + let pitr_cutoff = gc_info.cutoffs.pitr; let retain_lsns = gc_info.retain_lsns.clone(); - (horizon_cutoff, pitr_cutoff, retain_lsns) + + // Gets the maximum LSN that holds a valid lease. + // + // Caveat: `refresh_gc_info` is in charge of updating the lease map. + // Here, we do not check for stale leases again. 
+ let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn); + + ( + horizon_cutoff, + pitr_cutoff, + retain_lsns, + max_lsn_with_valid_lease, + ) }; - let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); + let standby_horizon = self.standby_horizon.load(); + // Hold GC for the standby, but as a safety guard do it only within some + // reasonable lag. + if standby_horizon != Lsn::INVALID { + if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) { + const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB + if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG { + new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff); + trace!("holding off GC for standby apply LSN {}", standby_horizon); + } else { + warn!( + "standby is lagging for more than {}MB, not holding gc for it", + MAX_ALLOWED_STANDBY_LAG / 1024 / 1024 + ) + } + } + } + + // Reset standby horizon to ignore it if it is not updated till next GC. + // It is an easy way to unset it when standby disappears without adding + // more conf options. + self.standby_horizon.store(Lsn::INVALID); + self.metrics + .standby_horizon_gauge + .set(Lsn::INVALID.0 as i64); let res = self - .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) + .gc_timeline( + horizon_cutoff, + pitr_cutoff, + retain_lsns, + max_lsn_with_valid_lease, + new_gc_cutoff, + ) .instrument( info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff), ) @@ -3983,8 +5032,11 @@ impl Timeline { horizon_cutoff: Lsn, pitr_cutoff: Lsn, retain_lsns: Vec, + max_lsn_with_valid_lease: Option, new_gc_cutoff: Lsn, - ) -> anyhow::Result { + ) -> Result { + // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc + let now = SystemTime::now(); let mut result: GcResult = GcResult::default(); @@ -4004,12 +5056,15 @@ impl Timeline { // The GC cutoff should only ever move forwards. 
let waitlist = { let write_guard = self.latest_gc_cutoff_lsn.lock_for_write(); - ensure!( - *write_guard <= new_gc_cutoff, - "Cannot move GC cutoff LSN backwards (was {}, new {})", - *write_guard, - new_gc_cutoff - ); + if *write_guard > new_gc_cutoff { + return Err(GcError::BadLsn { + why: format!( + "Cannot move GC cutoff LSN backwards (was {}, new {})", + *write_guard, new_gc_cutoff + ), + }); + } + write_guard.store_and_unlock(new_gc_cutoff) }; waitlist.wait().await; @@ -4019,7 +5074,6 @@ impl Timeline { debug!("retain_lsns: {:?}", retain_lsns); let mut layers_to_remove = Vec::new(); - let mut wanted_image_layers = KeySpaceRandomAccum::default(); // Scan all layers in the timeline (remote or on-disk). // @@ -4027,7 +5081,8 @@ impl Timeline { // 1. it is older than cutoff LSN; // 2. it is older than PITR interval; // 3. it doesn't need to be retained for 'retain_lsns'; - // 4. newer on-disk image layers cover the layer's whole key range + // 4. it does not need to be kept for LSNs holding valid leases. + // 5. newer on-disk image layers cover the layer's whole key range // // TODO holding a write lock is too agressive and avoidable let mut guard = self.layers.write().await; @@ -4039,7 +5094,7 @@ impl Timeline { if l.get_lsn_range().end > horizon_cutoff { debug!( "keeping {} because it's newer than horizon_cutoff {}", - l.filename(), + l.layer_name(), horizon_cutoff, ); result.layers_needed_by_cutoff += 1; @@ -4050,7 +5105,7 @@ impl Timeline { if l.get_lsn_range().end > pitr_cutoff { debug!( "keeping {} because it's newer than pitr_cutoff {}", - l.filename(), + l.layer_name(), pitr_cutoff, ); result.layers_needed_by_pitr += 1; @@ -4069,7 +5124,7 @@ impl Timeline { if &l.get_lsn_range().start <= retain_lsn { debug!( "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), retain_lsn, l.is_incremental(), ); @@ -4078,7 +5133,21 @@ impl Timeline { } } - // 4. 
Is there a later on-disk layer for this relation? + // 4. Is there a valid lease that requires us to keep this layer? + if let Some(lsn) = &max_lsn_with_valid_lease { + // keep if layer start <= any of the lease + if &l.get_lsn_range().start <= lsn { + debug!( + "keeping {} because there is a valid lease preventing GC at {}", + l.layer_name(), + lsn, + ); + result.layers_needed_by_leases += 1; + continue 'outer; + } + } + + // 5. Is there a later on-disk layer for this relation? // // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is @@ -4098,18 +5167,9 @@ impl Timeline { // we cannot remove C, even though it's older than 2500, because // the delta layer 2000-3000 depends on it. if !layers - .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))? + .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff)) { - debug!("keeping {} because it is the latest layer", l.filename()); - // Collect delta key ranges that need image layers to allow garbage - // collecting the layers. - // It is not so obvious whether we need to propagate information only about - // delta layers. Image layers can form "stairs" preventing old image from been deleted. - // But image layers are in any case less sparse than delta layers. Also we need some - // protection from replacing recent image layers with new one after each GC iteration. - if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&l) { - wanted_image_layers.add_range(l.get_key_range()); - } + debug!("keeping {} because it is the latest layer", l.layer_name()); result.layers_not_updated += 1; continue 'outer; } @@ -4117,29 +5177,25 @@ impl Timeline { // We didn't find any reason to keep this file, so remove it. 
debug!( "garbage collecting {} is_dropped: xx is_incremental: {}", - l.filename(), + l.layer_name(), l.is_incremental(), ); layers_to_remove.push(l); } - self.wanted_image_layers - .lock() - .unwrap() - .replace((new_gc_cutoff, wanted_image_layers.to_keyspace())); if !layers_to_remove.is_empty() { - // Persist the new GC cutoff value in the metadata file, before - // we actually remove anything. - // - // This does not in fact have any effect as we no longer consider local metadata unless - // running without remote storage. - // + // Persist the new GC cutoff value before we actually remove anything. // This unconditionally schedules also an index_part.json update, even though, we will // be doing one a bit later with the unlinked gc'd layers. - // - // TODO: remove when implementing . - self.update_metadata_file(self.disk_consistent_lsn.load(), None) - .await?; + let disk_consistent_lsn = self.disk_consistent_lsn.load(); + self.schedule_uploads(disk_consistent_lsn, None) + .map_err(|e| { + if self.cancel.is_cancelled() { + GcError::TimelineCancelled + } else { + GcError::Remote(e) + } + })?; let gc_layers = layers_to_remove .iter() @@ -4148,16 +5204,18 @@ impl Timeline { result.layers_removed = gc_layers.len() as u64; - if let Some(remote_client) = self.remote_client.as_ref() { - remote_client.schedule_gc_update(&gc_layers)?; - } + self.remote_client + .schedule_gc_update(&gc_layers) + .map_err(|e| { + if self.cancel.is_cancelled() { + GcError::TimelineCancelled + } else { + GcError::Remote(e) + } + })?; guard.finish_gc_timeline(&gc_layers); - if result.layers_removed != 0 { - fail_point!("after-timeline-gc-removed-layers"); - } - #[cfg(feature = "testing")] { result.doomed_layers = gc_layers; @@ -4169,7 +5227,7 @@ impl Timeline { result.layers_removed, new_gc_cutoff ); - result.elapsed = now.elapsed()?; + result.elapsed = now.elapsed().unwrap_or(Duration::ZERO); Ok(result) } @@ -4226,9 +5284,12 @@ impl Timeline { let img = match self .walredo_mgr + .as_ref() + 
.context("timeline has no walredo manager") + .map_err(PageReconstructError::WalRedo)? .request_redo(key, request_lsn, data.img, data.records, self.pg_version) .await - .context("Failed to reconstruct a page image:") + .context("reconstruct a page image") { Ok(img) => img, Err(e) => return Err(PageReconstructError::WalRedo(e)), @@ -4414,7 +5475,9 @@ impl Timeline { } } - pub fn get_download_all_remote_layers_task_info(&self) -> Option { + pub(crate) fn get_download_all_remote_layers_task_info( + &self, + ) -> Option { self.download_all_remote_layers_task_info .read() .unwrap() @@ -4422,81 +5485,27 @@ impl Timeline { } } -pub(crate) struct DiskUsageEvictionInfo { - /// Timeline's largest layer (remote or resident) - pub max_layer_size: Option, - /// Timeline's resident layers - pub resident_layers: Vec, -} - -pub(crate) struct LocalLayerInfoForDiskUsageEviction { - pub layer: Layer, - pub last_activity_ts: SystemTime, -} - -impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it - // having to allocate a string to this is bad, but it will rarely be formatted - let ts = chrono::DateTime::::from(self.last_activity_ts); - let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); - struct DisplayIsDebug<'a, T>(&'a T); - impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.0) - } - } - f.debug_struct("LocalLayerInfoForDiskUsageEviction") - .field("layer", &DisplayIsDebug(&self.layer)) - .field("last_activity", &ts) - .finish() - } -} - -impl LocalLayerInfoForDiskUsageEviction { - pub fn file_size(&self) -> u64 { - self.layer.layer_desc().file_size - } -} - impl Timeline { /// Returns non-remote layers for eviction. 
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let guard = self.layers.read().await; - let layers = guard.layer_map(); - let mut max_layer_size: Option = None; - let mut resident_layers = Vec::new(); - for l in layers.iter_historic_layers() { - let file_size = l.file_size(); - max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + let resident_layers = guard + .likely_resident_layers() + .map(|layer| { + let file_size = layer.layer_desc().file_size; + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); - let l = guard.get_from_desc(&l); + let last_activity_ts = layer.access_stats().latest_activity_or_now(); - let l = match l.keep_resident().await { - Ok(Some(l)) => l, - Ok(None) => continue, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. - tracing::warn!(layer=%l, "failed to keep the layer resident: {e:#}"); - continue; + EvictionCandidate { + layer: layer.into(), + last_activity_ts, + relative_last_activity: finite_f32::FiniteF32::ZERO, } - }; - - let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| { - // We only use this fallback if there's an implementation error. - // `latest_activity` already does rate-limited warn!() log. 
- debug!(layer=%l, "last_activity returns None, using SystemTime::now"); - SystemTime::now() - }); - - resident_layers.push(LocalLayerInfoForDiskUsageEviction { - layer: l.drop_eviction_guard(), - last_activity_ts, - }); - } + }) + .collect(); DiskUsageEvictionInfo { max_layer_size, @@ -4510,45 +5519,195 @@ impl Timeline { shard_count: self.tenant_shard_id.shard_count, } } + + #[cfg(test)] + pub(super) fn force_advance_lsn(self: &Arc, new_lsn: Lsn) { + self.last_record_lsn.advance(new_lsn); + } + + #[cfg(test)] + pub(super) fn force_set_disk_consistent_lsn(&self, new_value: Lsn) { + self.disk_consistent_lsn.store(new_value); + } + + /// Force create an image layer and place it into the layer map. + /// + /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + #[cfg(test)] + pub(super) async fn force_create_image_layer( + self: &Arc, + lsn: Lsn, + mut images: Vec<(Key, Bytes)>, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + assert!( + lsn <= last_record_lsn, + "advance last record lsn before inserting a layer, lsn={lsn}, last_record_lsn={last_record_lsn}" + ); + if let Some(check_start_lsn) = check_start_lsn { + assert!(lsn >= check_start_lsn); + } + images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb)); + let min_key = *images.first().map(|(k, _)| k).unwrap(); + let max_key = images.last().map(|(k, _)| k).unwrap().next(); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(min_key..max_key), + lsn, + ctx, + ) + .await?; + for (key, img) in images { + image_layer_writer.put_image(key, img, ctx).await?; + } + let image_layer = image_layer_writer.finish(self, ctx).await?; + + { + let mut guard = self.layers.write().await; + guard.force_insert_layer(image_layer); + } 
+ + Ok(()) + } + + /// Force create a delta layer and place it into the layer map. + /// + /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`] + /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are placed into the layer map in one run. + #[cfg(test)] + pub(super) async fn force_create_delta_layer( + self: &Arc, + mut deltas: Vec<(Key, Lsn, Value)>, + check_start_lsn: Option, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let last_record_lsn = self.get_last_record_lsn(); + deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); + let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); + let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); + let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + assert!( + max_lsn <= last_record_lsn, + "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + ); + let end_lsn = Lsn(max_lsn.0 + 1); + if let Some(check_start_lsn) = check_start_lsn { + assert!(min_lsn >= check_start_lsn); + } + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + min_key, + min_lsn..end_lsn, + ctx, + ) + .await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?; + + { + let mut guard = self.layers.write().await; + guard.force_insert_layer(delta_layer); + } + + Ok(()) + } + + /// Return all keys at the LSN in the image layers + #[cfg(test)] + pub(crate) async fn inspect_image_layers( + self: &Arc, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut all_data = Vec::new(); + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + if !layer.is_delta() && layer.image_layer_lsn() == lsn 
{ + let layer = guard.get_from_desc(&layer); + let mut reconstruct_data = ValuesReconstructState::default(); + layer + .get_values_reconstruct_data( + KeySpace::single(Key::MIN..Key::MAX), + lsn..Lsn(lsn.0 + 1), + &mut reconstruct_data, + ctx, + ) + .await?; + for (k, v) in reconstruct_data.keys { + all_data.push((k, v?.img.unwrap().1)); + } + } + } + all_data.sort(); + Ok(all_data) + } + + /// Get all historic layer descriptors in the layer map + #[cfg(test)] + pub(crate) async fn inspect_historic_layers( + self: &Arc, + ) -> anyhow::Result> { + let mut layers = Vec::new(); + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + layers.push(layer.key()); + } + Ok(layers) + } + + #[cfg(test)] + pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) { + let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone(); + keyspace.merge(&ks); + self.extra_test_dense_keyspace.store(Arc::new(keyspace)); + } } -type TraversalPathItem = ( - ValueReconstructResult, - Lsn, - Box TraversalId>, -); +type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); -/// Helper function for get_reconstruct_data() to add the path of layers traversed -/// to an error, as anyhow context information. -fn layer_traversal_error(msg: String, path: Vec) -> PageReconstructError { - // We want the original 'msg' to be the outermost context. The outermost context - // is the most high-level information, which also gets propagated to the client. - let mut msg_iter = path - .into_iter() - .map(|(r, c, l)| { - format!( - "layer traversal: result {:?}, cont_lsn {}, layer: {}", - r, - c, - l(), - ) - }) - .chain(std::iter::once(msg)); - // Construct initial message from the first traversed layer - let err = anyhow!(msg_iter.next().unwrap()); +/// Tracking writes ingestion does to a particular in-memory layer. +/// +/// Cleared upon freezing a layer. 
+struct TimelineWriterState { + open_layer: Arc, + current_size: u64, + // Previous Lsn which passed through + prev_lsn: Option, + // Largest Lsn which passed through the current writer + max_lsn: Option, + // Cached details of the last freeze. Avoids going trough the atomic/lock on every put. + cached_last_freeze_at: Lsn, +} - // Append all subsequent traversals, and the error message 'msg', as contexts. - let msg = msg_iter.fold(err, |err, msg| err.context(msg)); - PageReconstructError::from(msg) +impl TimelineWriterState { + fn new(open_layer: Arc, current_size: u64, last_freeze_at: Lsn) -> Self { + Self { + open_layer, + current_size, + prev_lsn: None, + max_lsn: None, + cached_last_freeze_at: last_freeze_at, + } + } } /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. -pub struct TimelineWriter<'a> { +pub(crate) struct TimelineWriter<'a> { tl: &'a Timeline, - _write_guard: tokio::sync::MutexGuard<'a, ()>, + write_guard: tokio::sync::MutexGuard<'a, Option>, } impl Deref for TimelineWriter<'_> { @@ -4559,31 +5718,180 @@ impl Deref for TimelineWriter<'_> { } } +#[derive(PartialEq)] +enum OpenLayerAction { + Roll, + Open, + None, +} + impl<'a> TimelineWriter<'a> { /// Put a new page version that can be constructed from a WAL record /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. - pub async fn put( - &self, + pub(crate) async fn put( + &mut self, key: Key, lsn: Lsn, value: &Value, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_value(key, lsn, value, ctx).await + // Avoid doing allocations for "small" values. 
+ // In the regression test suite, the limit of 256 avoided allocations in 95% of cases: + // https://github.com/neondatabase/neon/pull/5056#discussion_r1301975061 + let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); + value.ser_into(&mut buf)?; + let buf_size: u64 = buf.len().try_into().expect("oversized value buf"); + + let action = self.get_open_layer_action(lsn, buf_size); + let layer = self.handle_open_layer_action(lsn, action, ctx).await?; + let res = layer.put_value(key, lsn, &buf, ctx).await; + + if res.is_ok() { + // Update the current size only when the entire write was ok. + // In case of failures, we may have had partial writes which + // render the size tracking out of sync. That's ok because + // the checkpoint distance should be significantly smaller + // than the S3 single shot upload limit of 5GiB. + let state = self.write_guard.as_mut().unwrap(); + + state.current_size += buf_size; + state.prev_lsn = Some(lsn); + state.max_lsn = std::cmp::max(state.max_lsn, Some(lsn)); + } + + res } + async fn handle_open_layer_action( + &mut self, + at: Lsn, + action: OpenLayerAction, + ctx: &RequestContext, + ) -> anyhow::Result<&Arc> { + match action { + OpenLayerAction::Roll => { + let freeze_at = self.write_guard.as_ref().unwrap().max_lsn.unwrap(); + self.roll_layer(freeze_at).await?; + self.open_layer(at, ctx).await?; + } + OpenLayerAction::Open => self.open_layer(at, ctx).await?, + OpenLayerAction::None => { + assert!(self.write_guard.is_some()); + } + } + + Ok(&self.write_guard.as_ref().unwrap().open_layer) + } + + async fn open_layer(&mut self, at: Lsn, ctx: &RequestContext) -> anyhow::Result<()> { + let layer = self.tl.get_layer_for_write(at, ctx).await?; + let initial_size = layer.size().await?; + + let last_freeze_at = self.last_freeze_at.load(); + self.write_guard.replace(TimelineWriterState::new( + layer, + initial_size, + last_freeze_at, + )); + + Ok(()) + } + + async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { + let 
current_size = self.write_guard.as_ref().unwrap().current_size; + + // self.write_guard will be taken by the freezing + self.tl + .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) + .await; + + self.tl.flush_frozen_layers(freeze_at)?; + + if current_size >= self.get_checkpoint_distance() * 2 { + warn!("Flushed oversized open layer with size {}", current_size) + } + + Ok(()) + } + + fn get_open_layer_action(&self, lsn: Lsn, new_value_size: u64) -> OpenLayerAction { + let state = &*self.write_guard; + let Some(state) = &state else { + return OpenLayerAction::Open; + }; + + #[cfg(feature = "testing")] + if state.cached_last_freeze_at < self.tl.last_freeze_at.load() { + // this check and assertion are not really needed because + // LayerManager::try_freeze_in_memory_layer will always clear out the + // TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there + // is no TimelineWriterState. + assert!( + state.open_layer.end_lsn.get().is_some(), + "our open_layer must be outdated" + ); + + // this would be a memory leak waiting to happen because the in-memory layer always has + // an index + panic!("BUG: TimelineWriterState held on to frozen in-memory layer."); + } + + if state.prev_lsn == Some(lsn) { + // Rolling mid LSN is not supported by [downstream code]. + // Hence, only roll at LSN boundaries. + // + // [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422 + return OpenLayerAction::None; + } + + if state.current_size == 0 { + // Don't roll empty layers + return OpenLayerAction::None; + } + + if self.tl.should_roll( + state.current_size, + state.current_size + new_value_size, + self.get_checkpoint_distance(), + lsn, + state.cached_last_freeze_at, + state.open_layer.get_opened_at(), + ) { + OpenLayerAction::Roll + } else { + OpenLayerAction::None + } + } + + /// Put a batch of keys at the specified Lsns. 
+ /// + /// The batch is sorted by Lsn (enforced by usage of [`utils::vec_map::VecMap`]. pub(crate) async fn put_batch( - &self, - batch: &HashMap>, + &mut self, + batch: VecMap, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.tl.put_values(batch, ctx).await + for (lsn, (key, val)) in batch { + self.put(key, lsn, &val, ctx).await? + } + + Ok(()) } - pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { - self.tl.put_tombstones(batch).await + pub(crate) async fn delete_batch( + &mut self, + batch: &[(Range, Lsn)], + ctx: &RequestContext, + ) -> anyhow::Result<()> { + if let Some((_, lsn)) = batch.first() { + let action = self.get_open_layer_action(*lsn, 0); + let layer = self.handle_open_layer_action(*lsn, action, ctx).await?; + layer.put_tombstones(batch).await?; + } + + Ok(()) } /// Track the end of the latest digested WAL record. @@ -4611,26 +5919,6 @@ fn is_send() { _assert_send::>(); } -/// Add a suffix to a layer file's name: .{num}.old -/// Uses the first available num (starts at 0) -fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> { - let filename = path - .file_name() - .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?; - let mut new_path = path.to_owned(); - - for i in 0u32.. 
{ - new_path.set_file_name(format!("{filename}.{i}.old")); - if !new_path.exists() { - std::fs::rename(path, &new_path) - .with_context(|| format!("rename {path:?} to {new_path:?}"))?; - return Ok(()); - } - } - - bail!("couldn't find an unused backup number for {:?}", path) -} - #[cfg(test)] mod tests { use utils::{id::TimelineId, lsn::Lsn}; @@ -4644,33 +5932,28 @@ mod tests { let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap(); - let ctx = any_context(); - let tenant = harness.try_load(&ctx).await.unwrap(); + let (tenant, ctx) = harness.load().await; let timeline = tenant .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) .await .unwrap(); - let rtc = timeline - .remote_client - .clone() - .expect("just configured this"); - let layer = find_some_layer(&timeline).await; let layer = layer .keep_resident() .await .expect("no download => no downloading errors") - .expect("should had been resident") .drop_eviction_guard(); - let first = async { layer.evict_and_wait(&rtc).await }; - let second = async { layer.evict_and_wait(&rtc).await }; + let forever = std::time::Duration::from_secs(120); + + let first = layer.evict_and_wait(forever); + let second = layer.evict_and_wait(forever); let (first, second) = tokio::join!(first, second); let res = layer.keep_resident().await; - assert!(matches!(res, Ok(None)), "{res:?}"); + assert!(res.is_none(), "{res:?}"); match (first, second) { (Ok(()), Ok(())) => { @@ -4684,12 +5967,6 @@ mod tests { } } - fn any_context() -> crate::context::RequestContext { - use crate::context::*; - use crate::task_mgr::*; - RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) - } - async fn find_some_layer(timeline: &Timeline) -> Layer { let layers = timeline.layers.read().await; let desc = layers diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs new file mode 100644 index 0000000000..cd61418f3d --- /dev/null +++ 
b/pageserver/src/tenant/timeline/analysis.rs @@ -0,0 +1,90 @@ +use std::{collections::BTreeSet, ops::Range}; + +use utils::lsn::Lsn; + +use super::Timeline; + +#[derive(serde::Serialize)] +pub(crate) struct RangeAnalysis { + start: String, + end: String, + has_image: bool, + num_of_deltas_above_image: usize, + total_num_of_deltas: usize, +} + +impl Timeline { + pub(crate) async fn perf_info(&self) -> Vec { + // First, collect all split points of the layers. + let mut split_points = BTreeSet::new(); + let mut delta_ranges = Vec::new(); + let mut image_ranges = Vec::new(); + + let all_layer_files = { + let guard = self.layers.read().await; + guard.all_persistent_layers() + }; + let lsn = self.get_last_record_lsn(); + + for key in all_layer_files { + split_points.insert(key.key_range.start); + split_points.insert(key.key_range.end); + if key.is_delta { + delta_ranges.push((key.key_range.clone(), key.lsn_range.clone())); + } else { + image_ranges.push((key.key_range.clone(), key.lsn_range.start)); + } + } + + // For each split range, compute the estimated read amplification. + let split_points = split_points.into_iter().collect::>(); + + let mut result = Vec::new(); + + for i in 0..(split_points.len() - 1) { + let start = split_points[i]; + let end = split_points[i + 1]; + // Find the latest image layer that contains the information. + let mut maybe_image_layers = image_ranges + .iter() + // We insert split points for all image layers, and therefore a `contains` check for the start point should be enough. 
+ .filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn) + .cloned() + .collect::>(); + maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1)); + let image_layer = maybe_image_layers.last().cloned(); + let lsn_filter_start = image_layer + .as_ref() + .map(|(_, lsn)| *lsn) + .unwrap_or(Lsn::INVALID); + + fn overlaps_with(lsn_range_a: &Range, lsn_range_b: &Range) -> bool { + !(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end) + } + + let maybe_delta_layers = delta_ranges + .iter() + .filter(|(key_range, lsn_range)| { + key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range) + }) + .cloned() + .collect::>(); + + let pitr_delta_layers = delta_ranges + .iter() + .filter(|(key_range, _)| key_range.contains(&start)) + .cloned() + .collect::>(); + + result.push(RangeAnalysis { + start: start.to_string(), + end: end.to_string(), + has_image: image_layer.is_some(), + num_of_deltas_above_image: maybe_delta_layers.len(), + total_num_of_deltas: pitr_delta_layers.len(), + }); + } + + result + } +} diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs new file mode 100644 index 0000000000..8a95029f33 --- /dev/null +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -0,0 +1,1493 @@ +//! New compaction implementation. The algorithm itself is implemented in the +//! compaction crate. This file implements the callbacks and structs that allow +//! the algorithm to drive the process. +//! +//! The old legacy algorithm is implemented directly in `timeline.rs`. 
+ +use std::collections::BinaryHeap; +use std::ops::{Deref, Range}; +use std::sync::Arc; + +use super::layer_manager::LayerManager; +use super::{ + CompactFlags, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, + RecordedDuration, Timeline, +}; + +use anyhow::{anyhow, Context}; +use enumset::EnumSet; +use fail::fail_point; +use itertools::Itertools; +use pageserver_api::keyspace::ShardedRange; +use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; +use tokio_util::sync::CancellationToken; +use tracing::{debug, info, info_span, trace, warn, Instrument}; +use utils::id::TimelineId; + +use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; +use crate::page_cache; +use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc}; +use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome}; +use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter}; +use crate::tenant::timeline::{Layer, ResidentLayer}; +use crate::tenant::DeltaLayer; +use crate::virtual_file::{MaybeFatalIo, VirtualFile}; + +use crate::keyspace::KeySpace; +use crate::repository::Key; + +use utils::lsn::Lsn; + +use pageserver_compaction::helpers::overlaps_with; +use pageserver_compaction::interface::*; + +use super::CompactionError; + +impl Timeline { + /// TODO: cancellation + pub(crate) async fn compact_legacy( + self: &Arc, + _cancel: &CancellationToken, + flags: EnumSet, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + // High level strategy for compaction / image creation: + // + // 1. First, calculate the desired "partitioning" of the + // currently in-use key space. The goal is to partition the + // key space into roughly fixed-size chunks, but also take into + // account any existing image layers, and try to align the + // chunk boundaries with the existing image layers to avoid + // too much churn. Also try to align chunk boundaries with + // relation boundaries. 
In principle, we don't know about + // relation boundaries here, we just deal with key-value + // pairs, and the code in pgdatadir_mapping.rs knows how to + // map relations into key-value pairs. But in practice we know + // that 'field6' is the block number, and the fields 1-5 + // identify a relation. This is just an optimization, + // though. + // + // 2. Once we know the partitioning, for each partition, + // decide if it's time to create a new image layer. The + // criteria is: there has been too much "churn" since the last + // image layer? The "churn" is fuzzy concept, it's a + // combination of too many delta files, or too much WAL in + // total in the delta file. Or perhaps: if creating an image + // file would allow to delete some older files. + // + // 3. After that, we compact all level0 delta files if there + // are too many of them. While compacting, we also garbage + // collect any page versions that are no longer needed because + // of the new image layers we created in step 2. + // + // TODO: This high level strategy hasn't been implemented yet. + // Below are functions compact_level0() and create_image_layers() + // but they are a bit ad hoc and don't quite work like it's explained + // above. Rewrite it. + + // Is the timeline being deleted? 
+ if self.is_stopping() { + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); + } + + let target_file_size = self.get_checkpoint_distance(); + + // Define partitioning schema if needed + + // FIXME: the match should only cover repartitioning, not the next steps + let partition_count = match self + .repartition( + self.get_last_record_lsn(), + self.get_compaction_target_size(), + flags, + ctx, + ) + .await + { + Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them + let image_ctx = RequestContextBuilder::extend(ctx) + .access_stats_behavior(AccessStatsBehavior::Skip) + .build(); + + // 2. Compact + let timer = self.metrics.compact_time_histo.start_timer(); + self.compact_level0(target_file_size, ctx).await?; + timer.stop_and_record(); + + // 3. Create new image layers for partitions that have been modified + // "enough". + let mut partitioning = dense_partitioning; + partitioning + .parts + .extend(sparse_partitioning.into_dense().parts); + let image_layers = self + .create_image_layers( + &partitioning, + lsn, + if flags.contains(CompactFlags::ForceImageLayerCreation) { + ImageLayerCreationMode::Force + } else { + ImageLayerCreationMode::Try + }, + &image_ctx, + ) + .await?; + + self.upload_new_image_layers(image_layers)?; + partitioning.parts.len() + } + Err(err) => { + // no partitioning? This is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline + // as a simple key-value store, ignoring the datadir layout. Log the + // error but continue. 
+ // + // Suppress error when it's due to cancellation + if !self.cancel.is_cancelled() { + tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + } + 1 + } + }; + + if self.shard_identity.count >= ShardCount::new(2) { + // Limit the number of layer rewrites to the number of partitions: this means its + // runtime should be comparable to a full round of image layer creations, rather than + // being potentially much longer. + let rewrite_max = partition_count; + + self.compact_shard_ancestors(rewrite_max, ctx).await?; + } + + Ok(()) + } + + /// Check for layers that are elegible to be rewritten: + /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that + /// we don't indefinitely retain keys in this shard that aren't needed. + /// - For future use: layers beyond pitr_interval that are in formats we would + /// rather not maintain compatibility with indefinitely. + /// + /// Note: this phase may read and write many gigabytes of data: use rewrite_max to bound + /// how much work it will try to do in each compaction pass. + async fn compact_shard_ancestors( + self: &Arc, + rewrite_max: usize, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut drop_layers = Vec::new(); + let mut layers_to_rewrite: Vec = Vec::new(); + + // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a + // layer is behind this Lsn, it indicates that the layer is being retained beyond the + // pitr_interval, for example because a branchpoint references it. + // + // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we + // are rewriting layers. 
+ let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn(); + + tracing::info!( + "latest_gc_cutoff: {}, pitr cutoff {}", + *latest_gc_cutoff, + self.gc_info.read().unwrap().cutoffs.pitr + ); + + let layers = self.layers.read().await; + for layer_desc in layers.layer_map().iter_historic_layers() { + let layer = layers.get_from_desc(&layer_desc); + if layer.metadata().shard.shard_count == self.shard_identity.count { + // This layer does not belong to a historic ancestor, no need to re-image it. + continue; + } + + // This layer was created on an ancestor shard: check if it contains any data for this shard. + let sharded_range = ShardedRange::new(layer_desc.get_key_range(), &self.shard_identity); + let layer_local_page_count = sharded_range.page_count(); + let layer_raw_page_count = ShardedRange::raw_size(&layer_desc.get_key_range()); + if layer_local_page_count == 0 { + // This ancestral layer only covers keys that belong to other shards. + // We include the full metadata in the log: if we had some critical bug that caused + // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers. + info!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split, contains no keys for this shard.", + ); + + if cfg!(debug_assertions) { + // Expensive, exhaustive check of keys in this layer: this guards against ShardedRange's calculations being + // wrong. 
If ShardedRange claims the local page count is zero, then no keys in this layer + // should be !is_key_disposable() + let range = layer_desc.get_key_range(); + let mut key = range.start; + while key < range.end { + debug_assert!(self.shard_identity.is_key_disposable(&key)); + key = key.next(); + } + } + + drop_layers.push(layer); + continue; + } else if layer_local_page_count != u32::MAX + && layer_local_page_count == layer_raw_page_count + { + debug!(%layer, + "layer is entirely shard local ({} keys), no need to filter it", + layer_local_page_count + ); + continue; + } + + // Don't bother re-writing a layer unless it will at least halve its size + if layer_local_page_count != u32::MAX + && layer_local_page_count > layer_raw_page_count / 2 + { + debug!(%layer, + "layer is already mostly local ({}/{}), not rewriting", + layer_local_page_count, + layer_raw_page_count + ); + } + + // Don't bother re-writing a layer if it is within the PITR window: it will age-out eventually + // without incurring the I/O cost of a rewrite. + if layer_desc.get_lsn_range().end >= *latest_gc_cutoff { + debug!(%layer, "Skipping rewrite of layer still in GC window ({} >= {})", + layer_desc.get_lsn_range().end, *latest_gc_cutoff); + continue; + } + + if layer_desc.is_delta() { + // We do not yet implement rewrite of delta layers + debug!(%layer, "Skipping rewrite of delta layer"); + continue; + } + + // Only rewrite layers if their generations differ. 
This guarantees: + // - that local rewrite is safe, as local layer paths will differ between existing layer and rewritten one + // - that the layer is persistent in remote storage, as we only see old-generation'd layer via loading from remote storage + if layer.metadata().generation == self.generation { + debug!(%layer, "Skipping rewrite, is not from old generation"); + continue; + } + + if layers_to_rewrite.len() >= rewrite_max { + tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", + layers_to_rewrite.len() + ); + continue; + } + + // Fall through: all our conditions for doing a rewrite passed. + layers_to_rewrite.push(layer); + } + + // Drop read lock on layer map before we start doing time-consuming I/O + drop(layers); + + let mut replace_image_layers = Vec::new(); + + for layer in layers_to_rewrite { + tracing::info!(layer=%layer, "Rewriting layer after shard split..."); + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &layer.layer_desc().key_range, + layer.layer_desc().image_layer_lsn(), + ctx, + ) + .await?; + + // Safety of layer rewrites: + // - We are writing to a different local file path than we are reading from, so the old Layer + // cannot interfere with the new one. + // - In the page cache, contents for a particular VirtualFile are stored with a file_id that + // is different for two layers with the same name (in `ImageLayerInner::new` we always + // acquire a fresh id from [`crate::page_cache::next_file_id`]. So readers do not risk + // reading the index from one layer file, and then data blocks from the rewritten layer file. + // - Any readers that have a reference to the old layer will keep it alive until they are done + // with it. If they are trying to promote from remote storage, that will fail, but this is the same + // as for compaction generally: compaction is allowed to delete layers that readers might be trying to use. 
+ // - We do not run concurrently with other kinds of compaction, so the only layer map writes we race with are: + // - GC, which at worst witnesses us "undelete" a layer that they just deleted. + // - ingestion, which only inserts layers, therefore cannot collide with us. + let resident = layer.download_and_keep_resident().await?; + + let keys_written = resident + .filter(&self.shard_identity, &mut image_layer_writer, ctx) + .await?; + + if keys_written > 0 { + let new_layer = image_layer_writer.finish(self, ctx).await?; + tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", + layer.metadata().file_size, + new_layer.metadata().file_size); + + replace_image_layers.push((layer, new_layer)); + } else { + // Drop the old layer. Usually for this case we would already have noticed that + // the layer has no data for us with the ShardedRange check above, but + drop_layers.push(layer); + } + } + + // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded + // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch + // to remote index) and be removed. This is inefficient but safe. + fail::fail_point!("compact-shard-ancestors-localonly"); + + // Update the LayerMap so that readers will use the new layers, and enqueue it for writing to remote storage + self.rewrite_layers(replace_image_layers, drop_layers) + .await?; + + fail::fail_point!("compact-shard-ancestors-enqueued"); + + // We wait for all uploads to complete before finishing this compaction stage. This is not + // necessary for correctness, but it simplifies testing, and avoids proceeding with another + // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O + // load. 
+ self.remote_client.wait_completion().await?; + + fail::fail_point!("compact-shard-ancestors-persistent"); + + Ok(()) + } + + /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as + /// as Level 1 files. + async fn compact_level0( + self: &Arc, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + let CompactLevel0Phase1Result { + new_layers, + deltas_to_compact, + } = { + let phase1_span = info_span!("compact_level0_phase1"); + let ctx = ctx.attached_child(); + let mut stats = CompactLevel0Phase1StatsBuilder { + version: Some(2), + tenant_id: Some(self.tenant_shard_id), + timeline_id: Some(self.timeline_id), + ..Default::default() + }; + + let begin = tokio::time::Instant::now(); + let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await; + let now = tokio::time::Instant::now(); + stats.read_lock_acquisition_micros = + DurationRecorder::Recorded(RecordedDuration(now - begin), now); + self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) + .instrument(phase1_span) + .await? + }; + + if new_layers.is_empty() && deltas_to_compact.is_empty() { + // nothing to do + return Ok(()); + } + + self.finish_compact_batch(&new_layers, &Vec::new(), &deltas_to_compact) + .await?; + Ok(()) + } + + /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. 
+ async fn compact_level0_phase1( + self: &Arc, + guard: tokio::sync::OwnedRwLockReadGuard, + mut stats: CompactLevel0Phase1StatsBuilder, + target_file_size: u64, + ctx: &RequestContext, + ) -> Result { + stats.read_lock_held_spawn_blocking_startup_micros = + stats.read_lock_acquisition_micros.till_now(); // set by caller + let layers = guard.layer_map(); + let level0_deltas = layers.get_level0_deltas()?; + let mut level0_deltas = level0_deltas + .into_iter() + .map(|x| guard.get_from_desc(&x)) + .collect_vec(); + stats.level0_deltas_count = Some(level0_deltas.len()); + // Only compact if enough layers have accumulated. + let threshold = self.get_compaction_threshold(); + if level0_deltas.is_empty() || level0_deltas.len() < threshold { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); + } + + // Gather the files to compact in this iteration. + // + // Start with the oldest Level 0 delta file, and collect any other + // level 0 files that form a contiguous sequence, such that the end + // LSN of previous file matches the start LSN of the next file. + // + // Note that if the files don't form such a sequence, we might + // "compact" just a single file. That's a bit pointless, but it allows + // us to get rid of the level 0 file, and compact the other files on + // the next iteration. This could probably made smarter, but such + // "gaps" in the sequence of level 0 files should only happen in case + // of a crash, partial download from cloud storage, or something like + // that, so it's not a big deal in practice. 
+ level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start); + let mut level0_deltas_iter = level0_deltas.iter(); + + let first_level0_delta = level0_deltas_iter.next().unwrap(); + let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end; + let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len()); + + deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?); + for l in level0_deltas_iter { + let lsn_range = &l.layer_desc().lsn_range; + + if lsn_range.start != prev_lsn_end { + break; + } + deltas_to_compact.push(l.download_and_keep_resident().await?); + prev_lsn_end = lsn_range.end; + } + let lsn_range = Range { + start: deltas_to_compact + .first() + .unwrap() + .layer_desc() + .lsn_range + .start, + end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end, + }; + + info!( + "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", + lsn_range.start, + lsn_range.end, + deltas_to_compact.len(), + level0_deltas.len() + ); + + for l in deltas_to_compact.iter() { + info!("compact includes {l}"); + } + + // We don't need the original list of layers anymore. Drop it so that + // we don't accidentally use it later in the function. + drop(level0_deltas); + + stats.read_lock_held_prerequisites_micros = stats + .read_lock_held_spawn_blocking_startup_micros + .till_now(); + + // Determine N largest holes where N is number of compacted layers. + let max_holes = deltas_to_compact.len(); + let last_record_lsn = self.get_last_record_lsn(); + let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; + let min_hole_coverage_size = 3; // TODO: something more flexible? 
+ + // min-heap (reserve space for one more element added before eviction) + let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); + let mut prev: Option = None; + + let mut all_keys = Vec::new(); + + for l in deltas_to_compact.iter() { + all_keys.extend(l.load_keys(ctx).await?); + } + + // FIXME: should spawn_blocking the rest of this function + + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + + stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); + + for &DeltaEntry { key: next_key, .. } in all_keys.iter() { + if let Some(prev_key) = prev { + // just first fast filter, do not create hole entries for metadata keys. The last hole in the + // compaction is the gap between data key and metadata keys. + if next_key.to_i128() - prev_key.to_i128() >= min_hole_range + && !Key::is_metadata_key(&prev_key) + { + let key_range = prev_key..next_key; + // Measuring hole by just subtraction of i128 representation of key range boundaries + // has not so much sense, because largest holes will corresponds field1/field2 changes. + // But we are mostly interested to eliminate holes which cause generation of excessive image layers. + // That is why it is better to measure size of hole as number of covering image layers. 
+ let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len(); + if coverage_size >= min_hole_coverage_size { + heap.push(Hole { + key_range, + coverage_size, + }); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } + } + } + } + prev = Some(next_key.next()); + } + stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); + drop_rlock(guard); + stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now(); + let mut holes = heap.into_vec(); + holes.sort_unstable_by_key(|hole| hole.key_range.start); + let mut next_hole = 0; // index of next hole in holes vector + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. + let all_values_iter = all_keys.iter(); + + // This iterator walks through all keys and is needed to calculate size used by each key + let mut all_keys_iter = all_keys + .iter() + .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size)) + .coalesce(|mut prev, cur| { + // Coalesce keys that belong to the same key pair. + // This ensures that compaction doesn't put them + // into different layer files. + // Still limit this by the target file size, + // so that we keep the size of the files in + // check. + if prev.0 == cur.0 && prev.2 < target_file_size { + prev.2 += cur.2; + Ok(prev) + } else { + Err((prev, cur)) + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers, based on the current partitioning. + // + // We split the new delta layers on the key dimension. We iterate through the key space, and for each key, check if including the next key to the current output layer we're building would cause the layer to become too large. If so, dump the current output layer and start new one. + // It's possible that there is a single key with so many page versions that storing all of them in a single layer file + // would be too large. 
In that case, we also split on the LSN dimension. + // + // LSN + // ^ + // | + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | | | | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // + // + // If one key (X) has a lot of page versions: + // + // LSN + // ^ + // | (X) + // | +-----------+ +--+--+--+--+ + // | | | | | | | | + // | +-----------+ | | +--+ | + // | | | | | | | | + // | +-----------+ ==> | | | | | + // | | | | | +--+ | + // | +-----------+ | | | | | + // | | | | | | | | + // | +-----------+ +--+--+--+--+ + // | + // +--------------> key + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + let mut key_values_total_size = 0u64; + let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key + let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key + + for &DeltaEntry { + key, lsn, ref val, .. + } in all_values_iter + { + let value = val.load(ctx).await?; + let same_key = prev_key.map_or(false, |prev_key| prev_key == key); + // We need to check key boundaries once we reach next key or end of layer with the same key + if !same_key || lsn == dup_end_lsn { + let mut next_key_size = 0u64; + let is_dup_layer = dup_end_lsn.is_valid(); + dup_start_lsn = Lsn::INVALID; + if !same_key { + dup_end_lsn = Lsn::INVALID; + } + // Determine size occupied by this key. 
We stop at next key or when size becomes larger than target_file_size + for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() { + next_key_size = next_size; + if key != next_key { + if dup_end_lsn.is_valid() { + // We are writting segment with duplicates: + // place all remaining values of this key in separate segment + dup_start_lsn = dup_end_lsn; // new segments starts where old stops + dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range + } + break; + } + key_values_total_size += next_size; + // Check if it is time to split segment: if total keys size is larger than target file size. + // We need to avoid generation of empty segments if next_size > target_file_size. + if key_values_total_size > target_file_size && lsn != next_lsn { + // Split key between multiple layers: such layer can contain only single key + dup_start_lsn = if dup_end_lsn.is_valid() { + dup_end_lsn // new segment with duplicates starts where old one stops + } else { + lsn // start with the first LSN for this key + }; + dup_end_lsn = next_lsn; // upper LSN boundary is exclusive + break; + } + } + // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set. + if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() { + dup_start_lsn = dup_end_lsn; + dup_end_lsn = lsn_range.end; + } + if writer.is_some() { + let written_size = writer.as_mut().unwrap().size(); + let contains_hole = + next_hole < holes.len() && key >= holes[next_hole].key_range.end; + // check if key cause layer overflow or contains hole... + if is_dup_layer + || dup_end_lsn.is_valid() + || written_size + key_values_total_size > target_file_size + || contains_hole + { + // ... 
if so, flush previous layer and prepare to write new one + new_layers.push( + writer + .take() + .unwrap() + .finish(prev_key.unwrap().next(), self, ctx) + .await?, + ); + writer = None; + + if contains_hole { + // skip hole + next_hole += 1; + } + } + } + // Remember size of key value because at next iteration we will access next item + key_values_total_size = next_key_size; + } + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Err(CompactionError::Other(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + ))) + }); + + if !self.shard_identity.is_key_disposable(&key) { + if writer.is_none() { + // Create writer if not initiaized yet + writer = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + key, + if dup_end_lsn.is_valid() { + // this is a layer containing slice of values of the same key + debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn); + dup_start_lsn..dup_end_lsn + } else { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + lsn_range.clone() + }, + ctx, + ) + .await?, + ); + } + + writer + .as_mut() + .unwrap() + .put_value(key, lsn, value, ctx) + .await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } + + if !new_layers.is_empty() { + fail_point!("after-timeline-compacted-first-L1"); + } + + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next(), self, ctx).await?); + } + + // Sync layers + if !new_layers.is_empty() { + // Print a warning if the created layer is larger than double the target size + // Add two pages for potential overhead. This should in theory be already + // accounted for in the target calculation, but for very small targets, + // we still might easily hit the limit otherwise. 
+ let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2; + for layer in new_layers.iter() { + if layer.layer_desc().file_size > warn_limit { + warn!( + %layer, + "created delta file of size {} larger than double of target of {target_file_size}", layer.layer_desc().file_size + ); + } + } + + // The writer.finish() above already did the fsync of the inodes. + // We just need to fsync the directory in which these inodes are linked, + // which we know to be the timeline directory. + // + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. + let timeline_dir = VirtualFile::open( + &self + .conf + .timeline_path(&self.tenant_shard_id, &self.timeline_id), + ctx, + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + + stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now(); + stats.new_deltas_count = Some(new_layers.len()); + stats.new_deltas_size = Some(new_layers.iter().map(|l| l.layer_desc().file_size).sum()); + + match TryInto::::try_into(stats) + .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string")) + { + Ok(stats_json) => { + info!( + stats_json = stats_json.as_str(), + "compact_level0_phase1 stats available" + ) + } + Err(e) => { + warn!("compact_level0_phase1 stats failed to serialize: {:#}", e); + } + } + + Ok(CompactLevel0Phase1Result { + new_layers, + deltas_to_compact: deltas_to_compact + .into_iter() + .map(|x| x.drop_eviction_guard()) + .collect::>(), + }) + } +} + +#[derive(Default)] +struct CompactLevel0Phase1Result { + new_layers: Vec, + deltas_to_compact: Vec, +} + +#[derive(Default)] +struct CompactLevel0Phase1StatsBuilder { + version: Option, + tenant_id: Option, + timeline_id: 
Option, + read_lock_acquisition_micros: DurationRecorder, + read_lock_held_spawn_blocking_startup_micros: DurationRecorder, + read_lock_held_key_sort_micros: DurationRecorder, + read_lock_held_prerequisites_micros: DurationRecorder, + read_lock_held_compute_holes_micros: DurationRecorder, + read_lock_drop_micros: DurationRecorder, + write_layer_files_micros: DurationRecorder, + level0_deltas_count: Option, + new_deltas_count: Option, + new_deltas_size: Option, +} + +#[derive(serde::Serialize)] +struct CompactLevel0Phase1Stats { + version: u64, + tenant_id: TenantShardId, + timeline_id: TimelineId, + read_lock_acquisition_micros: RecordedDuration, + read_lock_held_spawn_blocking_startup_micros: RecordedDuration, + read_lock_held_key_sort_micros: RecordedDuration, + read_lock_held_prerequisites_micros: RecordedDuration, + read_lock_held_compute_holes_micros: RecordedDuration, + read_lock_drop_micros: RecordedDuration, + write_layer_files_micros: RecordedDuration, + level0_deltas_count: usize, + new_deltas_count: usize, + new_deltas_size: u64, +} + +impl TryFrom for CompactLevel0Phase1Stats { + type Error = anyhow::Error; + + fn try_from(value: CompactLevel0Phase1StatsBuilder) -> Result { + Ok(Self { + version: value.version.ok_or_else(|| anyhow!("version not set"))?, + tenant_id: value + .tenant_id + .ok_or_else(|| anyhow!("tenant_id not set"))?, + timeline_id: value + .timeline_id + .ok_or_else(|| anyhow!("timeline_id not set"))?, + read_lock_acquisition_micros: value + .read_lock_acquisition_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, + read_lock_held_spawn_blocking_startup_micros: value + .read_lock_held_spawn_blocking_startup_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, + read_lock_held_key_sort_micros: value + .read_lock_held_key_sort_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, + 
read_lock_held_prerequisites_micros: value + .read_lock_held_prerequisites_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, + read_lock_held_compute_holes_micros: value + .read_lock_held_compute_holes_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_held_compute_holes_micros not set"))?, + read_lock_drop_micros: value + .read_lock_drop_micros + .into_recorded() + .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?, + write_layer_files_micros: value + .write_layer_files_micros + .into_recorded() + .ok_or_else(|| anyhow!("write_layer_files_micros not set"))?, + level0_deltas_count: value + .level0_deltas_count + .ok_or_else(|| anyhow!("level0_deltas_count not set"))?, + new_deltas_count: value + .new_deltas_count + .ok_or_else(|| anyhow!("new_deltas_count not set"))?, + new_deltas_size: value + .new_deltas_size + .ok_or_else(|| anyhow!("new_deltas_size not set"))?, + }) + } +} + +impl Timeline { + /// Entry point for new tiered compaction algorithm. + /// + /// All the real work is in the implementation in the pageserver_compaction + /// crate. The code here would apply to any algorithm implemented by the + /// same interface, but tiered is the only one at the moment. + /// + /// TODO: cancellation + pub(crate) async fn compact_tiered( + self: &Arc, + _cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + let fanout = self.get_compaction_threshold() as u64; + let target_file_size = self.get_checkpoint_distance(); + + // Find the top of the historical layers + let end_lsn = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + + let l0_deltas = layers.get_level0_deltas()?; + drop(guard); + + // As an optimization, if we find that there are too few L0 layers, + // bail out early. We know that the compaction algorithm would do + // nothing in that case. 
+ if l0_deltas.len() < fanout as usize { + // doesn't need compacting + return Ok(()); + } + l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap() + }; + + // Is the timeline being deleted? + if self.is_stopping() { + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); + } + + let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?; + // TODO(chi): ignore sparse_keyspace for now, compact it in the future. + let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks)); + + pageserver_compaction::compact_tiered::compact_tiered( + &mut adaptor, + end_lsn, + target_file_size, + fanout, + ctx, + ) + .await?; + + adaptor.flush_updates().await?; + Ok(()) + } + + /// An experimental compaction building block that combines compaction with garbage collection. + /// + /// The current implementation picks all delta + image layers that are below or intersecting with + /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta + /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, + /// and create delta layers with all deltas >= gc horizon. + #[cfg(test)] + pub(crate) async fn compact_with_gc( + self: &Arc, + _cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + use crate::tenant::storage_layer::ValueReconstructState; + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. + // The layer selection has the following properties: + // 1. If a layer is in the selection, all layers below it are in the selection. + // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. 
+ let (layer_selection, gc_cutoff) = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + let gc_info = self.gc_info.read().unwrap(); + let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); + let mut selected_layers = Vec::new(); + // TODO: consider retain_lsns + drop(gc_info); + for desc in layers.iter_historic_layers() { + if desc.get_lsn_range().start <= gc_cutoff { + selected_layers.push(guard.get_from_desc(&desc)); + } + } + (selected_layers, gc_cutoff) + }; + // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. + let mut all_key_values = Vec::new(); + for layer in &layer_selection { + all_key_values.extend(layer.load_key_values(ctx).await?); + } + // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and + // image layers, make image appear later than delta. + struct ValueWrapper<'a>(&'a crate::repository::Value); + impl Ord for ValueWrapper<'_> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use crate::repository::Value; + use std::cmp::Ordering; + match (self.0, other.0) { + (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater, + (Value::WalRecord(_), Value::Image(_)) => Ordering::Less, + _ => Ordering::Equal, + } + } + } + impl PartialOrd for ValueWrapper<'_> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + impl PartialEq for ValueWrapper<'_> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == std::cmp::Ordering::Equal + } + } + impl Eq for ValueWrapper<'_> {} + all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { + (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) + }); + let max_lsn = all_key_values + .iter() + .map(|(_, lsn, _)| lsn) + .max() + .copied() + .unwrap() + + 1; + // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. + // Data of the same key. 
+ let mut accumulated_values = Vec::new(); + let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty + + /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. + async fn flush_accumulated_states( + tline: &Arc, + key: Key, + accumulated_values: &[&(Key, Lsn, crate::repository::Value)], + horizon: Lsn, + ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { + let mut base_image = None; + let mut keys_above_horizon = Vec::new(); + let mut delta_above_base_image = Vec::new(); + // We have a list of deltas/images. We want to create image layers while collect garbages. + for (key, lsn, val) in accumulated_values.iter().rev() { + if *lsn > horizon { + keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both + } else if *lsn <= horizon { + match val { + crate::repository::Value::Image(image) => { + if lsn <= &horizon { + base_image = Some((*lsn, image.clone())); + break; + } + } + crate::repository::Value::WalRecord(wal) => { + delta_above_base_image.push((*lsn, wal.clone())); + } + } + } + } + delta_above_base_image.reverse(); + keys_above_horizon.reverse(); + let state = ValueReconstructState { + img: base_image, + records: delta_above_base_image, + }; + let img = tline.reconstruct_value(key, horizon, state).await?; + Ok((keys_above_horizon, img)) + } + + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + all_key_values.first().unwrap().0, + gc_cutoff..max_lsn, // TODO: off by one? 
+ ctx, + ) + .await?; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()), + gc_cutoff, + ctx, + ) + .await?; + + for item @ (key, _, _) in &all_key_values { + if &last_key == key { + accumulated_values.push(item); + } else { + let (deltas, image) = + flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) + .await?; + image_layer_writer.put_image(last_key, image, ctx).await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + accumulated_values.clear(); + accumulated_values.push(item); + last_key = *key; + } + } + let (deltas, image) = + flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; + image_layer_writer.put_image(last_key, image, ctx).await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + accumulated_values.clear(); + // TODO: split layers + let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; + // Step 3: Place back to the layer map. 
+ { + let mut guard = self.layers.write().await; + guard.finish_gc_compaction( + &layer_selection, + &[delta_layer.clone(), image_layer.clone()], + &self.metrics, + ) + }; + Ok(()) + } +} + +struct TimelineAdaptor { + timeline: Arc, + + keyspace: (Lsn, KeySpace), + + new_deltas: Vec, + new_images: Vec, + layers_to_delete: Vec>, +} + +impl TimelineAdaptor { + pub fn new(timeline: &Arc, keyspace: (Lsn, KeySpace)) -> Self { + Self { + timeline: timeline.clone(), + keyspace, + new_images: Vec::new(), + new_deltas: Vec::new(), + layers_to_delete: Vec::new(), + } + } + + pub async fn flush_updates(&mut self) -> anyhow::Result<()> { + let layers_to_delete = { + let guard = self.timeline.layers.read().await; + self.layers_to_delete + .iter() + .map(|x| guard.get_from_desc(x)) + .collect::>() + }; + self.timeline + .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete) + .await?; + + self.timeline + .upload_new_image_layers(std::mem::take(&mut self.new_images))?; + + self.new_deltas.clear(); + self.layers_to_delete.clear(); + Ok(()) + } +} + +#[derive(Clone)] +struct ResidentDeltaLayer(ResidentLayer); +#[derive(Clone)] +struct ResidentImageLayer(ResidentLayer); + +impl CompactionJobExecutor for TimelineAdaptor { + type Key = crate::repository::Key; + + type Layer = OwnArc; + type DeltaLayer = ResidentDeltaLayer; + type ImageLayer = ResidentImageLayer; + + type RequestContext = crate::context::RequestContext; + + fn get_shard_identity(&self) -> &ShardIdentity { + self.timeline.get_shard_identity() + } + + async fn get_layers( + &mut self, + key_range: &Range, + lsn_range: &Range, + _ctx: &RequestContext, + ) -> anyhow::Result>> { + self.flush_updates().await?; + + let guard = self.timeline.layers.read().await; + let layer_map = guard.layer_map(); + + let result = layer_map + .iter_historic_layers() + .filter(|l| { + overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range) + }) + .map(OwnArc) + .collect(); + Ok(result) + } + + 
async fn get_keyspace( + &mut self, + key_range: &Range, + lsn: Lsn, + _ctx: &RequestContext, + ) -> anyhow::Result>> { + if lsn == self.keyspace.0 { + Ok(pageserver_compaction::helpers::intersect_keyspace( + &self.keyspace.1.ranges, + key_range, + )) + } else { + // The current compaction implementatin only ever requests the key space + // at the compaction end LSN. + anyhow::bail!("keyspace not available for requested lsn"); + } + } + + async fn downcast_delta_layer( + &self, + layer: &OwnArc, + ) -> anyhow::Result> { + // this is a lot more complex than a simple downcast... + if layer.is_delta() { + let l = { + let guard = self.timeline.layers.read().await; + guard.get_from_desc(layer) + }; + let result = l.download_and_keep_resident().await?; + + Ok(Some(ResidentDeltaLayer(result))) + } else { + Ok(None) + } + } + + async fn create_image( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + Ok(self.create_image_impl(lsn, key_range, ctx).await?) + } + + async fn create_delta( + &mut self, + lsn_range: &Range, + key_range: &Range, + input_layers: &[ResidentDeltaLayer], + ctx: &RequestContext, + ) -> anyhow::Result<()> { + debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); + + let mut all_entries = Vec::new(); + for dl in input_layers.iter() { + all_entries.extend(dl.load_keys(ctx).await?); + } + + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn)); + + let mut writer = DeltaLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + key_range.start, + lsn_range.clone(), + ctx, + ) + .await?; + + let mut dup_values = 0; + + // This iterator walks through all key-value pairs from all the layers + // we're compacting, in key, LSN order. 
+ let mut prev: Option<(Key, Lsn)> = None; + for &DeltaEntry { + key, lsn, ref val, .. + } in all_entries.iter() + { + if prev == Some((key, lsn)) { + // This is a duplicate. Skip it. + // + // It can happen if compaction is interrupted after writing some + // layers but not all, and we are compacting the range again. + // The calculations in the algorithm assume that there are no + // duplicates, so the math on targeted file size is likely off, + // and we will create smaller files than expected. + dup_values += 1; + continue; + } + + let value = val.load(ctx).await?; + + writer.put_value(key, lsn, value, ctx).await?; + + prev = Some((key, lsn)); + } + + if dup_values > 0 { + warn!("delta layer created with {} duplicate values", dup_values); + } + + fail_point!("delta-layer-writer-fail-before-finish", |_| { + Err(anyhow::anyhow!( + "failpoint delta-layer-writer-fail-before-finish" + )) + }); + + let new_delta_layer = writer + .finish(prev.unwrap().0.next(), &self.timeline, ctx) + .await?; + + self.new_deltas.push(new_delta_layer); + Ok(()) + } + + async fn delete_layer( + &mut self, + layer: &OwnArc, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.layers_to_delete.push(layer.clone().0); + Ok(()) + } +} + +impl TimelineAdaptor { + async fn create_image_impl( + &mut self, + lsn: Lsn, + key_range: &Range, + ctx: &RequestContext, + ) -> Result<(), CreateImageLayersError> { + let timer = self.timeline.metrics.create_images_time_histo.start_timer(); + + let image_layer_writer = ImageLayerWriter::new( + self.timeline.conf, + self.timeline.timeline_id, + self.timeline.tenant_shard_id, + key_range, + lsn, + ctx, + ) + .await?; + + fail_point!("image-layer-writer-fail-before-finish", |_| { + Err(CreateImageLayersError::Other(anyhow::anyhow!( + "failpoint image-layer-writer-fail-before-finish" + ))) + }); + + let keyspace = KeySpace { + ranges: self.get_keyspace(key_range, lsn, ctx).await?, + }; + // TODO set proper (stateful) start. 
The create_image_layer_for_rel_blocks function mostly + let start = Key::MIN; + let ImageLayerCreationOutcome { + image, + next_start_key: _, + } = self + .timeline + .create_image_layer_for_rel_blocks( + &keyspace, + image_layer_writer, + lsn, + ctx, + key_range.clone(), + start, + ) + .await?; + + if let Some(image_layer) = image { + self.new_images.push(image_layer); + } + + timer.stop_and_record(); + + Ok(()) + } +} + +impl CompactionRequestContext for crate::context::RequestContext {} + +#[derive(Debug, Clone)] +pub struct OwnArc(pub Arc); + +impl Deref for OwnArc { + type Target = as Deref>::Target; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl AsRef for OwnArc { + fn as_ref(&self) -> &T { + self.0.as_ref() + } +} + +impl CompactionLayer for OwnArc { + fn key_range(&self) -> &Range { + &self.key_range + } + fn lsn_range(&self) -> &Range { + &self.lsn_range + } + fn file_size(&self) -> u64 { + self.file_size + } + fn short_id(&self) -> std::string::String { + self.as_ref().short_id().to_string() + } + fn is_delta(&self) -> bool { + self.as_ref().is_delta() + } +} + +impl CompactionLayer for OwnArc { + fn key_range(&self) -> &Range { + &self.layer_desc().key_range + } + fn lsn_range(&self) -> &Range { + &self.layer_desc().lsn_range + } + fn file_size(&self) -> u64 { + self.layer_desc().file_size + } + fn short_id(&self) -> std::string::String { + self.layer_desc().short_id().to_string() + } + fn is_delta(&self) -> bool { + true + } +} + +use crate::tenant::timeline::DeltaEntry; + +impl CompactionLayer for ResidentDeltaLayer { + fn key_range(&self) -> &Range { + &self.0.layer_desc().key_range + } + fn lsn_range(&self) -> &Range { + &self.0.layer_desc().lsn_range + } + fn file_size(&self) -> u64 { + self.0.layer_desc().file_size + } + fn short_id(&self) -> std::string::String { + self.0.layer_desc().short_id().to_string() + } + fn is_delta(&self) -> bool { + true + } +} + +impl CompactionDeltaLayer for ResidentDeltaLayer { + type DeltaEntry<'a> = 
DeltaEntry<'a>; + + async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result>> { + self.0.load_keys(ctx).await + } +} + +impl CompactionLayer for ResidentImageLayer { + fn key_range(&self) -> &Range { + &self.0.layer_desc().key_range + } + fn lsn_range(&self) -> &Range { + &self.0.layer_desc().lsn_range + } + fn file_size(&self) -> u64 { + self.0.layer_desc().file_size + } + fn short_id(&self) -> std::string::String { + self.0.layer_desc().short_id().to_string() + } + fn is_delta(&self) -> bool { + false + } +} +impl CompactionImageLayer for ResidentImageLayer {} diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index be873181d9..441298f3e9 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,106 +6,40 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{debug, error, info, instrument, warn, Instrument, Span}; -use utils::{crashsafe, fs_ext, id::TimelineId}; +use tracing::{error, info, instrument, Instrument}; +use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; use crate::{ config::PageServerConf, - deletion_queue::DeletionQueueClient, task_mgr::{self, TaskKind}, tenant::{ - debug_assert_current_span_has_tenant_and_timeline_id, metadata::TimelineMetadata, - remote_timeline_client::{ - self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, - }, + remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, Tenant, }, }; use super::{Timeline, TimelineResources}; -/// Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
-async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Notify any timeline work to drop out of loops/requests - tracing::debug!("Cancelling CancellationToken"); - timeline.cancel.cancel(); - - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; - if let Some(walreceiver) = maybe_started_walreceiver { - walreceiver.stop().await; - } - debug!("wal receiver shutdown confirmed"); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - // Prevent new uploads from starting. - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.stop(); - match res { - Ok(()) => {} - Err(e) => match e { - remote_timeline_client::StopError::QueueUninitialized => { - // This case shouldn't happen currently because the - // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. - // That is, before we declare the Tenant as Active. - // But we only allow calls to delete_timeline on Active tenants. - return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); - } - }, - } - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. 
- info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks( - None, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-before-index-deleted-at" - ))? - }); - - tracing::debug!("Waiting for gate..."); - timeline.gate.close().await; - tracing::debug!("Shutdown complete"); - - Ok(()) -} - /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - if let Some(remote_client) = timeline.remote_client.as_ref() { - match remote_client.persist_index_part_with_deleted_flag().await { - // If we (now, or already) marked it successfully as deleted, we can proceed - Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), - // Bail out otherwise - // - // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents - // two tasks from performing the deletion at the same time. The first task - // that starts deletion should run it to completion. - Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) - | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { - return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); - } + match timeline + .remote_client + .persist_index_part_with_deleted_flag() + .await + { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. 
+ Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); } } Ok(()) @@ -124,7 +58,7 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi /// No timeout here, GC & Compaction should be responsive to the /// `TimelineState::Stopping` change. // pub(super): documentation link -pub(super) async fn delete_local_layer_files( +pub(super) async fn delete_local_timeline_directory( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline: &Timeline, @@ -149,8 +83,6 @@ pub(super) async fn delete_local_layer_files( // NB: This need not be atomic because the deleted flag in the IndexPart // will be observed during tenant/timeline load. The deletion will be resumed there. // - // For configurations without remote storage, we guarantee crash-safety by persising delete mark file. - // // Note that here we do not bail out on std::io::ErrorKind::NotFound. // This can happen if we're called a second time, e.g., // because of a previous failure/cancellation at/after @@ -158,72 +90,21 @@ pub(super) async fn delete_local_layer_files( // // ErrorKind::NotFound can also happen if we race with tenant detach, because, // no locks are shared. - // - // For now, log and continue. - // warn! level is technically not appropriate for the - // first case because we should expect retries to happen. - // But the error is so rare, it seems better to get attention if it happens. - // - // Note that metadata removal is skipped, this is not technically needed, - // but allows to reuse timeline loading code during resumed deletion. 
- // (we always expect that metadata is in place when timeline is being loaded) + tokio::fs::remove_dir_all(local_timeline_directory) + .await + .or_else(fs_ext::ignore_not_found) + .context("remove local timeline directory")?; - #[cfg(feature = "testing")] - let mut counter = 0; - - // Timeline directory may not exist if we failed to delete mark file and request was retried. - if !local_timeline_directory.exists() { - return Ok(()); - } - - let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id); - - for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) { - #[cfg(feature = "testing")] - { - counter += 1; - if counter == 2 { - fail::fail_point!("timeline-delete-during-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))? - }); - } - } - - let entry = entry?; - if entry.path() == metadata_path { - debug!("found metadata, skipping"); - continue; - } - - if entry.path() == local_timeline_directory { - // Keeping directory because metedata file is still there - debug!("found timeline dir itself, skipping"); - continue; - } - - let metadata = match entry.metadata() { - Ok(metadata) => metadata, - Err(e) => { - if crate::is_walkdir_io_not_found(&e) { - warn!( - timeline_dir=?local_timeline_directory, - path=?entry.path().display(), - "got not found err while removing timeline dir, proceeding anyway" - ); - continue; - } - anyhow::bail!(e); - } - }; - - if metadata.is_dir() { - warn!(path=%entry.path().display(), "unexpected directory under timeline dir"); - tokio::fs::remove_dir(entry.path()).await - } else { - tokio::fs::remove_file(entry.path()).await - } - .with_context(|| format!("Failed to remove: {}", entry.path().display()))?; - } + // Make sure previous deletions are ordered before mark removal. + // Otherwise there is no guarantee that they reach the disk before mark deletion. 
+ // So its possible for mark to reach disk first and for other deletions + // to be reordered later and thus missed if a crash occurs. + // Note that we dont need to sync after mark file is removed + // because we can tolerate the case when mark file reappears on startup. + let timeline_path = conf.timelines_path(&tenant_shard_id); + crashsafe::fsync_async(timeline_path) + .await + .context("fsync_pre_mark_remove")?; info!("finished deleting layer files, releasing locks"); drop(guards); @@ -237,11 +118,11 @@ pub(super) async fn delete_local_layer_files( /// Removes remote layers and an index file after them. async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { - if let Some(remote_client) = &timeline.remote_client { - remote_client.delete_all().await.context("delete_all")? - }; - - Ok(()) + timeline + .remote_client + .delete_all() + .await + .context("delete_all") } // This function removs remaining traces of a timeline on disk. @@ -254,39 +135,6 @@ async fn cleanup_remaining_timeline_fs_traces( tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> anyhow::Result<()> { - // Remove local metadata - tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id)) - .await - .or_else(fs_ext::ignore_not_found) - .context("remove metadata")?; - - fail::fail_point!("timeline-delete-after-rm-metadata", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-after-rm-metadata" - ))? - }); - - // Remove timeline dir - tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id)) - .await - .or_else(fs_ext::ignore_not_found) - .context("timeline dir")?; - - fail::fail_point!("timeline-delete-after-rm-dir", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))? - }); - - // Make sure previous deletions are ordered before mark removal. - // Otherwise there is no guarantee that they reach the disk before mark deletion. 
- // So its possible for mark to reach disk first and for other deletions - // to be reordered later and thus missed if a crash occurs. - // Note that we dont need to sync after mark file is removed - // because we can tolerate the case when mark file reappears on startup. - let timeline_path = conf.timelines_path(&tenant_shard_id); - crashsafe::fsync_async(timeline_path) - .await - .context("fsync_pre_mark_remove")?; - // Remove delete mark // TODO: once we are confident that no more exist in the field, remove this // line. It cleans up a legacy marker file that might in rare cases be present. @@ -356,17 +204,26 @@ impl DeleteTimelineFlow { // NB: If this fails half-way through, and is retried, the retry will go through // all the same steps again. Make sure the code here is idempotent, and don't // error out if some of the shutdown tasks have already been completed! - #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))] + #[instrument(skip_all, fields(%inplace))] pub async fn run( tenant: &Arc, timeline_id: TimelineId, inplace: bool, ) -> Result<(), DeleteTimelineError> { + super::debug_assert_current_span_has_tenant_and_timeline_id(); + let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; guard.mark_in_progress()?; - stop_tasks(&timeline).await?; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. + timeline.shutdown(super::ShutdownMode::Hard).await; + + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-deleted-at" + ))? 
+ }); set_deleted_in_remote_index(&timeline).await?; @@ -404,8 +261,7 @@ impl DeleteTimelineFlow { tenant: Arc, timeline_id: TimelineId, local_metadata: &TimelineMetadata, - remote_client: Option, - deletion_queue_client: DeletionQueueClient, + remote_client: RemoteTimelineClient, ) -> anyhow::Result<()> { // Note: here we even skip populating layer map. Timeline is essentially uninitialized. // RemoteTimelineClient is the only functioning part. @@ -416,11 +272,13 @@ impl DeleteTimelineFlow { None, // Ancestor is not needed for deletion. TimelineResources { remote_client, - deletion_queue_client, + timeline_get_throttle: tenant.timeline_get_throttle.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. CreateTimelineCause::Delete, + // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace + None, ) .context("create_timeline_struct")?; @@ -539,12 +397,7 @@ impl DeleteTimelineFlow { }; Ok(()) } - .instrument({ - let span = - tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id); - span.follows_from(Span::current()); - span - }), + .instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)), ); } @@ -554,15 +407,12 @@ impl DeleteTimelineFlow { tenant: &Tenant, timeline: &Timeline, ) -> Result<(), DeleteTimelineError> { - delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; delete_remote_layers_and_index(timeline).await?; pausable_failpoint!("in_progress_delete"); - cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id) - .await?; - remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; *guard = 
Self::Finished; @@ -573,6 +423,10 @@ impl DeleteTimelineFlow { pub(crate) fn is_finished(&self) -> bool { matches!(self, Self::Finished) } + + pub(crate) fn is_not_started(&self) -> bool { + matches!(self, Self::NotStarted) + } } struct DeletionGuard(OwnedMutexGuard); diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs new file mode 100644 index 0000000000..4fc89330ba --- /dev/null +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -0,0 +1,555 @@ +use std::sync::Arc; + +use super::{layer_manager::LayerManager, FlushLayerError, Timeline}; +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + tenant::{ + storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, + Tenant, + }, + virtual_file::{MaybeFatalIo, VirtualFile}, +}; +use tokio_util::sync::CancellationToken; +use tracing::Instrument; +use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; + +#[derive(Debug, thiserror::Error)] +pub(crate) enum Error { + #[error("no ancestors")] + NoAncestor, + #[error("too many ancestors")] + TooManyAncestors, + #[error("shutting down, please retry later")] + ShuttingDown, + #[error("flushing failed")] + FlushAncestor(#[source] FlushLayerError), + #[error("layer download failed")] + RewrittenDeltaDownloadFailed(#[source] anyhow::Error), + #[error("copying LSN prefix locally failed")] + CopyDeltaPrefix(#[source] anyhow::Error), + #[error("upload rewritten layer")] + UploadRewritten(#[source] anyhow::Error), + + #[error("ancestor is already being detached by: {}", .0)] + OtherTimelineDetachOngoing(TimelineId), + + #[error("remote copying layer failed")] + CopyFailed(#[source] anyhow::Error), + + #[error("unexpected error")] + Unexpected(#[source] anyhow::Error), +} + +impl From for ApiError { + fn from(value: Error) -> Self { + match value { + e @ Error::NoAncestor => ApiError::Conflict(e.to_string()), + // TODO: 
ApiError converts the anyhow using debug formatting ... just stop using ApiError? + e @ Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", e)), + Error::ShuttingDown => ApiError::ShuttingDown, + Error::OtherTimelineDetachOngoing(_) => { + ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) + } + // All of these contain shutdown errors, in fact, it's the most common + e @ Error::FlushAncestor(_) + | e @ Error::RewrittenDeltaDownloadFailed(_) + | e @ Error::CopyDeltaPrefix(_) + | e @ Error::UploadRewritten(_) + | e @ Error::CopyFailed(_) + | e @ Error::Unexpected(_) => ApiError::InternalServerError(e.into()), + } + } +} + +pub(crate) struct PreparedTimelineDetach { + layers: Vec, +} + +/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. +#[derive(Debug)] +pub(crate) struct Options { + pub(crate) rewrite_concurrency: std::num::NonZeroUsize, + pub(crate) copy_concurrency: std::num::NonZeroUsize, +} + +impl Default for Options { + fn default() -> Self { + Self { + rewrite_concurrency: std::num::NonZeroUsize::new(2).unwrap(), + copy_concurrency: std::num::NonZeroUsize::new(100).unwrap(), + } + } +} + +/// See [`Timeline::prepare_to_detach_from_ancestor`] +pub(super) async fn prepare( + detached: &Arc, + tenant: &Tenant, + options: Options, + ctx: &RequestContext, +) -> Result<(completion::Completion, PreparedTimelineDetach), Error> { + use Error::*; + + let Some((ancestor, ancestor_lsn)) = detached + .ancestor_timeline + .as_ref() + .map(|tl| (tl.clone(), detached.ancestor_lsn)) + else { + // TODO: check if we have already been detached; for this we need to read the stored data + // on remote client, for that we need a follow-up which makes uploads cheaper and maintains + // a projection of the commited data. 
+ // + // the error is wrong per openapi + return Err(NoAncestor); + }; + + if !ancestor_lsn.is_valid() { + return Err(NoAncestor); + } + + if ancestor.ancestor_timeline.is_some() { + // non-technical requirement; we could flatten N ancestors just as easily but we chose + // not to, at least initially + return Err(TooManyAncestors); + } + + // before we acquire the gate, we must mark the ancestor as having a detach operation + // ongoing which will block other concurrent detach operations so we don't get to ackward + // situations where there would be two branches trying to reparent earlier branches. + let (guard, barrier) = completion::channel(); + + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + } + *guard = Some((detached.timeline_id, barrier)); + } + + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + if ancestor_lsn >= ancestor.get_disk_consistent_lsn() { + let span = + tracing::info_span!("freeze_and_flush", ancestor_timeline_id=%ancestor.timeline_id); + async { + let started_at = std::time::Instant::now(); + let freeze_and_flush = ancestor.freeze_and_flush0(); + let mut freeze_and_flush = std::pin::pin!(freeze_and_flush); + + let res = + tokio::time::timeout(std::time::Duration::from_secs(1), &mut freeze_and_flush) + .await; + + let res = match res { + Ok(res) => res, + Err(_elapsed) => { + tracing::info!("freezing and flushing ancestor is still ongoing"); + freeze_and_flush.await + } + }; + + res.map_err(FlushAncestor)?; + + // we do not need to wait for uploads to complete but we do need `struct Layer`, + // copying delta prefix is unsupported currently for `InMemoryLayer`. 
+ tracing::info!( + elapsed_ms = started_at.elapsed().as_millis(), + "froze and flushed the ancestor" + ); + Ok(()) + } + .instrument(span) + .await?; + } + + let end_lsn = ancestor_lsn + 1; + + let (filtered_layers, straddling_branchpoint, rest_of_historic) = { + // we do not need to start from our layers, because they can only be layers that come + // *after* ancestor_lsn + let layers = tokio::select! { + guard = ancestor.layers.read() => guard, + _ = detached.cancel.cancelled() => { + return Err(ShuttingDown); + } + _ = ancestor.cancel.cancelled() => { + return Err(ShuttingDown); + } + }; + + // between retries, these can change if compaction or gc ran in between. this will mean + // we have to redo work. + partition_work(ancestor_lsn, &layers) + }; + + // TODO: layers are already sorted by something: use that to determine how much of remote + // copies are already done. + tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); + + // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after + let mut new_layers: Vec = + Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len()); + + { + tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers"); + + let mut tasks = tokio::task::JoinSet::new(); + + let mut wrote_any = false; + + let limiter = Arc::new(tokio::sync::Semaphore::new( + options.rewrite_concurrency.get(), + )); + + for layer in straddling_branchpoint { + let limiter = limiter.clone(); + let timeline = detached.clone(); + let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); + + tasks.spawn(async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + Ok(copied) + }); + } + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Ok(Some(copied))) 
=> { + wrote_any = true; + tracing::info!(layer=%copied, "rewrote and uploaded"); + new_layers.push(copied); + } + Ok(Ok(None)) => {} + Ok(Err(e)) => return Err(e), + Err(je) => return Err(Unexpected(je.into())), + } + } + + // FIXME: the fsync should be mandatory, after both rewrites and copies + if wrote_any { + let timeline_dir = VirtualFile::open( + &detached + .conf + .timeline_path(&detached.tenant_shard_id, &detached.timeline_id), + ctx, + ) + .await + .fatal_err("VirtualFile::open for timeline dir fsync"); + timeline_dir + .sync_all() + .await + .fatal_err("VirtualFile::sync_all timeline dir"); + } + } + + let mut tasks = tokio::task::JoinSet::new(); + let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + + for adopted in rest_of_historic { + let limiter = limiter.clone(); + let timeline = detached.clone(); + + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let owned = + remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?; + tracing::info!(layer=%owned, "remote copied"); + Ok(owned) + } + .in_current_span(), + ); + } + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Ok(owned)) => { + new_layers.push(owned); + } + Ok(Err(failed)) => { + return Err(failed); + } + Err(je) => return Err(Unexpected(je.into())), + } + } + + // TODO: fsync directory again if we hardlinked something + + let prepared = PreparedTimelineDetach { layers: new_layers }; + + Ok((guard, prepared)) +} + +fn partition_work( + ancestor_lsn: Lsn, + source_layermap: &LayerManager, +) -> (usize, Vec, Vec) { + let mut straddling_branchpoint = vec![]; + let mut rest_of_historic = vec![]; + + let mut later_by_lsn = 0; + + for desc in source_layermap.layer_map().iter_historic_layers() { + // off by one chances here: + // - start is inclusive + // - end is exclusive + if desc.lsn_range.start > ancestor_lsn { + later_by_lsn += 1; + continue; + } + + let target = if desc.lsn_range.start <= 
ancestor_lsn + && desc.lsn_range.end > ancestor_lsn + && desc.is_delta + { + // TODO: image layer at Lsn optimization + &mut straddling_branchpoint + } else { + &mut rest_of_historic + }; + + target.push(source_layermap.get_from_desc(&desc)); + } + + (later_by_lsn, straddling_branchpoint, rest_of_historic) +} + +async fn upload_rewritten_layer( + end_lsn: Lsn, + layer: &Layer, + target: &Arc, + cancel: &CancellationToken, + ctx: &RequestContext, +) -> Result, Error> { + use Error::UploadRewritten; + let copied = copy_lsn_prefix(end_lsn, layer, target, ctx).await?; + + let Some(copied) = copied else { + return Ok(None); + }; + + // FIXME: better shuttingdown error + target + .remote_client + .upload_layer_file(&copied, cancel) + .await + .map_err(UploadRewritten)?; + + Ok(Some(copied.into())) +} + +async fn copy_lsn_prefix( + end_lsn: Lsn, + layer: &Layer, + target_timeline: &Arc, + ctx: &RequestContext, +) -> Result, Error> { + use Error::{CopyDeltaPrefix, RewrittenDeltaDownloadFailed}; + + tracing::debug!(%layer, %end_lsn, "copying lsn prefix"); + + let mut writer = DeltaLayerWriter::new( + target_timeline.conf, + target_timeline.timeline_id, + target_timeline.tenant_shard_id, + layer.layer_desc().key_range.start, + layer.layer_desc().lsn_range.start..end_lsn, + ctx, + ) + .await + .map_err(CopyDeltaPrefix)?; + + let resident = layer + .download_and_keep_resident() + .await + // likely shutdown + .map_err(RewrittenDeltaDownloadFailed)?; + + let records = resident + .copy_delta_prefix(&mut writer, end_lsn, ctx) + .await + .map_err(CopyDeltaPrefix)?; + + drop(resident); + + tracing::debug!(%layer, records, "copied records"); + + if records == 0 { + drop(writer); + // TODO: we might want to store an empty marker in remote storage for this + // layer so that we will not needlessly walk `layer` on repeated attempts. + Ok(None) + } else { + // reuse the key instead of adding more holes between layers by using the real + // highest key in the layer. 
+ let reused_highest_key = layer.layer_desc().key_range.end; + let copied = writer + .finish(reused_highest_key, target_timeline, ctx) + .await + .map_err(CopyDeltaPrefix)?; + + tracing::debug!(%layer, %copied, "new layer produced"); + + Ok(Some(copied)) + } +} + +/// Creates a new Layer instance for the adopted layer, and ensures it is found from the remote +/// storage on successful return without the adopted layer being added to `index_part.json`. +async fn remote_copy( + adopted: &Layer, + adoptee: &Arc, + generation: Generation, + cancel: &CancellationToken, +) -> Result { + use Error::CopyFailed; + + // depending if Layer::keep_resident we could hardlink + + let mut metadata = adopted.metadata(); + debug_assert!(metadata.generation <= generation); + metadata.generation = generation; + + let owned = crate::tenant::storage_layer::Layer::for_evicted( + adoptee.conf, + adoptee, + adopted.layer_desc().layer_name(), + metadata, + ); + + // FIXME: better shuttingdown error + adoptee + .remote_client + .copy_timeline_layer(adopted, &owned, cancel) + .await + .map(move |()| owned) + .map_err(CopyFailed) +} + +/// See [`Timeline::complete_detaching_timeline_ancestor`]. +pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + prepared: PreparedTimelineDetach, + _ctx: &RequestContext, +) -> Result, anyhow::Error> { + let PreparedTimelineDetach { layers } = prepared; + + let ancestor = detached + .get_ancestor_timeline() + .expect("must still have a ancestor"); + let ancestor_lsn = detached.get_ancestor_lsn(); + + // publish the prepared layers before we reparent any of the timelines, so that on restart + // reparented timelines find layers. also do the actual detaching. + // + // if we crash after this operation, we will at least come up having detached a timeline, but + // we cannot go back and reparent the timelines which would had been reparented in normal + // execution. 
+ // + // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart + // which could give us a completely wrong layer combination. + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await?; + + let mut tasks = tokio::task::JoinSet::new(); + + // because we are now keeping the slot in progress, it is unlikely that there will be any + // timeline deletions during this time. if we raced one, then we'll just ignore it. + tenant + .timelines + .lock() + .unwrap() + .values() + .filter_map(|tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + if !tl.is_active() { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(&ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl.clone()) + } else { + None + } + }) + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + + tasks.spawn( + async move { + let res = timeline + .remote_client + .schedule_reparenting_and_wait(&new_parent) + .await; + + match res { + Ok(()) => Some(timeline), + Err(e) => { + // with the use of tenant slot, we no longer expect these. 
+ tracing::warn!("reparenting failed: {e:#}"); + None + } + } + } + .instrument(span), + ); + }); + + let reparenting_candidates = tasks.len(); + let mut reparented = Vec::with_capacity(tasks.len()); + + while let Some(res) = tasks.join_next().await { + match res { + Ok(Some(timeline)) => { + tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); + reparented.push(timeline.timeline_id); + } + Ok(None) => { + // lets just ignore this for now. one or all reparented timelines could had + // started deletion, and that is fine. + } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // ignore; it's better to continue with a single reparenting failing (or even + // all of them) in order to get to the goal state. + // + // these timelines will never be reparentable, but they can be always detached as + // separate tree roots. + } + Err(je) => tracing::error!("unexpected join error: {je:?}"), + } + } + + if reparenting_candidates != reparented.len() { + tracing::info!("failed to reparent some candidates"); + } + + Ok(reparented) +} diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index ea5f5f5fa7..8a8c38d0ce 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -20,23 +20,21 @@ use std::{ time::{Duration, SystemTime}, }; +use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, instrument, warn, Instrument}; +use tracing::{debug, info, info_span, instrument, warn, Instrument}; use crate::{ context::{DownloadBehavior, RequestContext}, pgdatadir_mapping::CollectKeySpaceError, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, tenant::{ - config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, - tasks::BackgroundLoopKind, - timeline::EvictionError, - 
LogicalSizeCalculationCause, Tenant, + tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, }; -use utils::completion; +use utils::{completion, sync::gate::GateGuard}; use super::Timeline; @@ -53,6 +51,7 @@ pub struct EvictionTaskTenantState { impl Timeline { pub(super) fn launch_eviction_task( self: &Arc, + parent: Arc, background_tasks_can_start: Option<&completion::Barrier>, ) { let self_clone = Arc::clone(self); @@ -68,28 +67,34 @@ impl Timeline { ), false, async move { - let cancel = task_mgr::shutdown_token(); tokio::select! { - _ = cancel.cancelled() => { return Ok(()); } + _ = self_clone.cancel.cancelled() => { return Ok(()); } _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {} }; - self_clone.eviction_task(cancel).await; + self_clone.eviction_task(parent).await; Ok(()) }, ); } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - async fn eviction_task(self: Arc, cancel: CancellationToken) { + async fn eviction_task(self: Arc, tenant: Arc) { use crate::tenant::tasks::random_init_delay; + + // acquire the gate guard only once within a useful span + let Ok(guard) = self.gate.enter() else { + return; + }; + { let policy = self.get_eviction_policy(); let period = match policy { EvictionPolicy::LayerAccessThreshold(lat) => lat.period, + EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; - if random_init_delay(period, &cancel).await.is_err() { + if random_init_delay(period, &self.cancel).await.is_err() { return; } } @@ -97,12 +102,14 @@ impl Timeline { let ctx = RequestContext::new(TaskKind::Eviction, DownloadBehavior::Warn); loop { let policy = self.get_eviction_policy(); - let cf = self.eviction_iteration(&policy, &cancel, &ctx).await; + let cf = self + .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx) + .await; match cf { 
ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { - if tokio::time::timeout_at(sleep_until, cancel.cancelled()) + if tokio::time::timeout_at(sleep_until, self.cancel.cancelled()) .await .is_ok() { @@ -116,95 +123,84 @@ impl Timeline { #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))] async fn eviction_iteration( self: &Arc, + tenant: &Tenant, policy: &EvictionPolicy, cancel: &CancellationToken, + gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<(), Instant> { debug!("eviction iteration: {policy:?}"); - match policy { + let start = Instant::now(); + let (period, threshold) = match policy { EvictionPolicy::NoEviction => { // check again in 10 seconds; XXX config watch mechanism - ControlFlow::Continue(Instant::now() + Duration::from_secs(10)) + return ControlFlow::Continue(Instant::now() + Duration::from_secs(10)); } EvictionPolicy::LayerAccessThreshold(p) => { - let start = Instant::now(); - match self.eviction_iteration_threshold(p, cancel, ctx).await { + match self + .eviction_iteration_threshold(tenant, p, cancel, gate, ctx) + .await + { ControlFlow::Break(()) => return ControlFlow::Break(()), ControlFlow::Continue(()) => (), } - let elapsed = start.elapsed(); - crate::tenant::tasks::warn_when_period_overrun( - elapsed, - p.period, - BackgroundLoopKind::Eviction, - ); - crate::metrics::EVICTION_ITERATION_DURATION - .get_metric_with_label_values(&[ - &format!("{}", p.period.as_secs()), - &format!("{}", p.threshold.as_secs()), - ]) - .unwrap() - .observe(elapsed.as_secs_f64()); - ControlFlow::Continue(start + p.period) + (p.period, p.threshold) } - } + EvictionPolicy::OnlyImitiate(p) => { + if self + .imitiate_only(tenant, p, cancel, gate, ctx) + .await + .is_break() + { + return ControlFlow::Break(()); + } + (p.period, p.threshold) + } + }; + + let elapsed = start.elapsed(); + crate::tenant::tasks::warn_when_period_overrun( + elapsed, + period, + BackgroundLoopKind::Eviction, + ); + // FIXME: if we were to mix 
policies on a pageserver, we would have no way to sense this. I + // don't think that is a relevant fear however, and regardless the imitation should be the + // most costly part. + crate::metrics::EVICTION_ITERATION_DURATION + .get_metric_with_label_values(&[ + &format!("{}", period.as_secs()), + &format!("{}", threshold.as_secs()), + ]) + .unwrap() + .observe(elapsed.as_secs_f64()); + + ControlFlow::Continue(start + period) } async fn eviction_iteration_threshold( self: &Arc, + tenant: &Tenant, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, + gate: &GateGuard, ctx: &RequestContext, ) -> ControlFlow<()> { let now = SystemTime::now(); - let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( - BackgroundLoopKind::Eviction, - ctx, - ); + let permit = self.acquire_imitation_permit(cancel, ctx).await?; - let _permit = tokio::select! { - permit = acquire_permit => permit, - _ = cancel.cancelled() => return ControlFlow::Break(()), - _ = self.cancel.cancelled() => return ControlFlow::Break(()), - }; + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await?; - // If we evict layers but keep cached values derived from those layers, then - // we face a storm of on-demand downloads after pageserver restart. - // The reason is that the restart empties the caches, and so, the values - // need to be re-computed by accessing layers, which we evicted while the - // caches were filled. - // - // Solutions here would be one of the following: - // 1. Have a persistent cache. - // 2. Count every access to a cached value to the access stats of all layers - // that were accessed to compute the value in the first place. - // 3. Invalidate the caches at a period of < p.threshold/2, so that the values - // get re-computed from layers, thereby counting towards layer access stats. - // 4. Make the eviction task imitate the layer accesses that typically hit caches. 
- // - // We follow approach (4) here because in Neon prod deployment: - // - page cache is quite small => high churn => low hit rate - // => eviction gets correct access stats - // - value-level caches such as logical size & repatition have a high hit rate, - // especially for inactive tenants - // => eviction sees zero accesses for these - // => they cause the on-demand download storm on pageserver restart - // - // We should probably move to persistent caches in the future, or avoid - // having inactive tenants attached to pageserver in the first place. - match self.imitate_layer_accesses(p, cancel, ctx).await { - ControlFlow::Break(()) => return ControlFlow::Break(()), - ControlFlow::Continue(()) => (), - } - - #[allow(dead_code)] #[derive(Debug, Default)] struct EvictionStats { candidates: usize, evicted: usize, errors: usize, not_evictable: usize, + timeouts: usize, + #[allow(dead_code)] skipped_for_shutdown: usize, } @@ -215,41 +211,22 @@ impl Timeline { // So, we just need to deal with this. - let remote_client = match self.remote_client.as_ref() { - Some(c) => c, - None => { - error!("no remote storage configured, cannot evict layers"); - return ControlFlow::Continue(()); - } - }; - let mut js = tokio::task::JoinSet::new(); { let guard = self.layers.read().await; let layers = guard.layer_map(); - for hist_layer in layers.iter_historic_layers() { - let hist_layer = guard.get_from_desc(&hist_layer); + for layer in layers.iter_historic_layers() { + let layer = guard.get_from_desc(&layer); // guard against eviction while we inspect it; it might be that eviction_task and // disk_usage_eviction_task both select the same layers to be evicted, and // seemingly free up double the space. both succeeding is of no consequence. - let guard = match hist_layer.keep_resident().await { - Ok(Some(l)) => l, - Ok(None) => continue, - Err(e) => { - // these should not happen, but we cannot make them statically impossible right - // now. 
- tracing::warn!(layer=%hist_layer, "failed to keep the layer resident: {e:#}"); - continue; - } - }; - let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| { - // We only use this fallback if there's an implementation error. - // `latest_activity` already does rate-limited warn!() log. - debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now"); - SystemTime::now() - }); + if !layer.is_likely_resident() { + continue; + } + + let last_activity_ts = layer.access_stats().latest_activity_or_now(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, @@ -272,11 +249,13 @@ impl Timeline { continue; } }; - let layer = guard.drop_eviction_guard(); + if no_activity_for > p.threshold { - let remote_client = remote_client.clone(); - // this could cause a lot of allocations in some cases - js.spawn(async move { layer.evict_and_wait(&remote_client).await }); + js.spawn(async move { + layer + .evict_and_wait(std::time::Duration::from_secs(5)) + .await + }); stats.candidates += 1; } } @@ -289,6 +268,9 @@ impl Timeline { Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => { stats.not_evictable += 1; } + Ok(Err(EvictionError::Timeout)) => { + stats.timeouts += 1; + } Err(je) if je.is_cancelled() => unreachable!("not used"), Err(je) if je.is_panic() => { /* already logged */ @@ -304,7 +286,8 @@ impl Timeline { stats = join_all => { if stats.candidates == stats.not_evictable { debug!(stats=?stats, "eviction iteration complete"); - } else if stats.errors > 0 || stats.not_evictable > 0 { + } else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 { + // reminder: timeouts are not eviction cancellations warn!(stats=?stats, "eviction iteration complete"); } else { info!(stats=?stats, "eviction iteration complete"); @@ -318,13 +301,80 @@ impl Timeline { ControlFlow::Continue(()) } + /// Like `eviction_iteration_threshold`, but without any eviction. 
Eviction will be done by + /// disk usage based eviction task. + async fn imitiate_only( + self: &Arc, + tenant: &Tenant, + p: &EvictionPolicyLayerAccessThreshold, + cancel: &CancellationToken, + gate: &GateGuard, + ctx: &RequestContext, + ) -> ControlFlow<()> { + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + + self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx) + .await + } + + async fn acquire_imitation_permit( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> { + let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit( + BackgroundLoopKind::Eviction, + ctx, + ); + + tokio::select! { + permit = acquire_permit => ControlFlow::Continue(permit), + _ = cancel.cancelled() => ControlFlow::Break(()), + _ = self.cancel.cancelled() => ControlFlow::Break(()), + } + } + + /// If we evict layers but keep cached values derived from those layers, then + /// we face a storm of on-demand downloads after pageserver restart. + /// The reason is that the restart empties the caches, and so, the values + /// need to be re-computed by accessing layers, which we evicted while the + /// caches were filled. + /// + /// Solutions here would be one of the following: + /// 1. Have a persistent cache. + /// 2. Count every access to a cached value to the access stats of all layers + /// that were accessed to compute the value in the first place. + /// 3. Invalidate the caches at a period of < p.threshold/2, so that the values + /// get re-computed from layers, thereby counting towards layer access stats. + /// 4. Make the eviction task imitate the layer accesses that typically hit caches. 
+ /// + /// We follow approach (4) here because in Neon prod deployment: + /// - page cache is quite small => high churn => low hit rate + /// => eviction gets correct access stats + /// - value-level caches such as logical size & repatition have a high hit rate, + /// especially for inactive tenants + /// => eviction sees zero accesses for these + /// => they cause the on-demand download storm on pageserver restart + /// + /// We should probably move to persistent caches in the future, or avoid + /// having inactive tenants attached to pageserver in the first place. #[instrument(skip_all)] async fn imitate_layer_accesses( &self, + tenant: &Tenant, p: &EvictionPolicyLayerAccessThreshold, cancel: &CancellationToken, + gate: &GateGuard, + permit: tokio::sync::SemaphorePermit<'static>, ctx: &RequestContext, ) -> ControlFlow<()> { + if !self.tenant_shard_id.is_shard_zero() { + // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size + // for consumption metrics (consumption metrics are only sent from shard 0). We may therefore + // skip imitating logical size accesses for eviction purposes. + return ControlFlow::Continue(()); + } + let mut state = self.eviction_task_timeline_state.lock().await; // Only do the imitate_layer accesses approximately as often as the threshold. A little @@ -334,7 +384,7 @@ impl Timeline { match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { - self.imitate_timeline_cached_layer_accesses(ctx).await; + self.imitate_timeline_cached_layer_accesses(gate, ctx).await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()) } } @@ -348,17 +398,32 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. 
- let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) { - Ok(t) => t, - Err(_) => { - return ControlFlow::Break(()); + let (mut state, _permit) = { + if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() { + (locked, permit) + } else { + // we might need to wait for a long time here in case of pathological synthetic + // size calculation performance + drop(permit); + let locked = tokio::select! { + locked = tenant.eviction_task_tenant_state.lock() => locked, + _ = self.cancel.cancelled() => { + return ControlFlow::Break(()) + }, + _ = cancel.cancelled() => { + return ControlFlow::Break(()) + } + }; + // then reacquire -- this will be bad if there is a lot of traffic, but because we + // released the permit, the overall latency will be much better. + let permit = self.acquire_imitation_permit(cancel, ctx).await?; + (locked, permit) } }; - let mut state = tenant.eviction_task_tenant_state.lock().await; match state.last_layer_access_imitation { Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { - self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx) + self.imitate_synthetic_size_calculation_worker(tenant, cancel, ctx) .await; state.last_layer_access_imitation = Some(tokio::time::Instant::now()); } @@ -374,12 +439,21 @@ impl Timeline { /// Recompute the values which would cause on-demand downloads during restart. 
#[instrument(skip_all)] - async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) { + async fn imitate_timeline_cached_layer_accesses( + &self, + guard: &GateGuard, + ctx: &RequestContext, + ) { let lsn = self.get_last_record_lsn(); // imitiate on-restart initial logical size let size = self - .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx) + .calculate_logical_size( + lsn, + LogicalSizeCalculationCause::EvictionTaskImitation, + guard, + ctx, + ) .instrument(info_span!("calculate_logical_size")) .await; @@ -423,7 +497,7 @@ impl Timeline { #[instrument(skip_all)] async fn imitate_synthetic_size_calculation_worker( &self, - tenant: &Arc, + tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext, ) { diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 916ebfc6d9..5bc67c7133 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -6,31 +6,29 @@ use crate::{ self, index::{IndexPart, LayerFileMetadata}, }, - storage_layer::LayerFileName, - Generation, + storage_layer::LayerName, }, - METADATA_FILE_NAME, }; use anyhow::Context; -use camino::Utf8Path; -use pageserver_api::shard::ShardIndex; -use std::{collections::HashMap, str::FromStr}; +use camino::{Utf8Path, Utf8PathBuf}; +use std::{ + collections::{hash_map, HashMap}, + str::FromStr, +}; use utils::lsn::Lsn; /// Identified files in the timeline directory. 
pub(super) enum Discovered { /// The only one we care about - Layer(LayerFileName, u64), + Layer(LayerName, LocalLayerFileMetadata), /// Old ephmeral files from previous launches, should be removed Ephemeral(String), /// Old temporary timeline files, unsure what these really are, should be removed Temporary(String), /// Temporary on-demand download files, should be removed TemporaryDownload(String), - /// "metadata" file we persist locally and include in `index_part.json` - Metadata, /// Backup file from previously future layers - IgnoredBackup, + IgnoredBackup(Utf8PathBuf), /// Unrecognized, warn about these Unknown(String), } @@ -43,17 +41,18 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result { let file_size = direntry.metadata()?.len(); - Discovered::Layer(file_name, file_size) + Discovered::Layer( + file_name, + LocalLayerFileMetadata::new(direntry.path().to_owned(), file_size), + ) } Err(_) => { - if file_name == METADATA_FILE_NAME { - Discovered::Metadata - } else if file_name.ends_with(".old") { + if file_name.ends_with(".old") { // ignore these - Discovered::IgnoredBackup + Discovered::IgnoredBackup(direntry.path().to_owned()) } else if remote_timeline_client::is_temp_download_file(direntry.path()) { Discovered::TemporaryDownload(file_name) } else if is_ephemeral_file(&file_name) { @@ -72,19 +71,36 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result Self { + Self { + local_path, + file_size, + } + } +} + +/// For a layer that is present in remote metadata, this type describes how to handle +/// it during startup: it is either Resident (and we have some metadata about a local file), +/// or it is Evicted (and we only have remote metadata). #[derive(Clone, Debug)] pub(super) enum Decision { /// The layer is not present locally. Evicted(LayerFileMetadata), - /// The layer is present locally, but local metadata does not match remote; we must - /// delete it and treat it as evicted. 
- UseRemote { - local: LayerFileMetadata, + /// The layer is present locally, and metadata matches: we may hook up this layer to the + /// existing file in local storage. + Resident { + local: LocalLayerFileMetadata, remote: LayerFileMetadata, }, - /// The layer is present locally, and metadata matches. - UseLocal(LayerFileMetadata), } /// A layer needs to be left out of the layer map. @@ -92,82 +108,89 @@ pub(super) enum Decision { pub(super) enum DismissedLayer { /// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded. Future { - /// The local metadata. `None` if the layer is only known through [`IndexPart`]. - local: Option, + /// `None` if the layer is only known through [`IndexPart`]. + local: Option, }, /// The layer only exists locally. /// /// In order to make crash safe updates to layer map, we must dismiss layers which are only /// found locally or not yet included in the remote `index_part.json`. - LocalOnly(LayerFileMetadata), + LocalOnly(LocalLayerFileMetadata), + + /// The layer exists in remote storage but the local layer's metadata (e.g. file size) + /// does not match it + BadMetadata(LocalLayerFileMetadata), } /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. 
pub(super) fn reconcile( - discovered: Vec<(LayerFileName, u64)>, + local_layers: Vec<(LayerName, LocalLayerFileMetadata)>, index_part: Option<&IndexPart>, disk_consistent_lsn: Lsn, - generation: Generation, - shard: ShardIndex, -) -> Vec<(LayerFileName, Result)> { - use Decision::*; +) -> Vec<(LayerName, Result)> { + let Some(index_part) = index_part else { + // If we have no remote metadata, no local layer files are considered valid to load + return local_layers + .into_iter() + .map(|(layer_name, local_metadata)| { + (layer_name, Err(DismissedLayer::LocalOnly(local_metadata))) + }) + .collect(); + }; - // name => (local, remote) - type Collected = HashMap, Option)>; + let mut result = Vec::new(); - let mut discovered = discovered - .into_iter() - .map(|(name, file_size)| { - ( - name, - // The generation and shard here will be corrected to match IndexPart in the merge below, unless - // it is not in IndexPart, in which case using our current generation makes sense - // because it will be uploaded in this generation. - ( - Some(LayerFileMetadata::new(file_size, generation, shard)), - None, - ), - ) - }) - .collect::(); + let mut remote_layers = HashMap::new(); - // merge any index_part information, when available + // Construct Decisions for layers that are found locally, if they're in remote metadata. Otherwise + // construct DismissedLayers to get rid of them. 
+ for (layer_name, local_metadata) in local_layers { + let Some(remote_metadata) = index_part.layer_metadata.get(&layer_name) else { + result.push((layer_name, Err(DismissedLayer::LocalOnly(local_metadata)))); + continue; + }; + + if remote_metadata.file_size != local_metadata.file_size { + result.push((layer_name, Err(DismissedLayer::BadMetadata(local_metadata)))); + continue; + } + + remote_layers.insert( + layer_name, + Decision::Resident { + local: local_metadata, + remote: remote_metadata.clone(), + }, + ); + } + + // Construct Decision for layers that were not found locally index_part - .as_ref() - .map(|ip| ip.layer_metadata.iter()) - .into_iter() - .flatten() - .map(|(name, metadata)| (name, LayerFileMetadata::from(metadata))) + .layer_metadata + .iter() .for_each(|(name, metadata)| { - if let Some(existing) = discovered.get_mut(name) { - existing.1 = Some(metadata); - } else { - discovered.insert(name.to_owned(), (None, Some(metadata))); + if let hash_map::Entry::Vacant(entry) = remote_layers.entry(name.clone()) { + entry.insert(Decision::Evicted(metadata.clone())); } }); - discovered - .into_iter() - .map(|(name, (local, remote))| { - let decision = if name.is_in_future(disk_consistent_lsn) { - Err(DismissedLayer::Future { local }) - } else { - match (local, remote) { - (Some(local), Some(remote)) if local != remote => { - Ok(UseRemote { local, remote }) - } - (Some(x), Some(_)) => Ok(UseLocal(x)), - (None, Some(x)) => Ok(Evicted(x)), - (Some(x), None) => Err(DismissedLayer::LocalOnly(x)), - (None, None) => { - unreachable!("there must not be any non-local non-remote files") - } - } - }; + // For layers that were found in authoritative remote metadata, apply a final check that they are within + // the disk_consistent_lsn. 
+ result.extend(remote_layers.into_iter().map(|(name, decision)| { + if name.is_in_future(disk_consistent_lsn) { + match decision { + Decision::Evicted(_remote) => (name, Err(DismissedLayer::Future { local: None })), + Decision::Resident { + local, + remote: _remote, + } => (name, Err(DismissedLayer::Future { local: Some(local) })), + } + } else { + (name, Ok(decision)) + } + })); - (name, decision) - }) - .collect::>() + result } pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { @@ -176,30 +199,20 @@ pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> { std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}")) } -pub(super) fn cleanup_local_file_for_remote( - path: &Utf8Path, - local: &LayerFileMetadata, - remote: &LayerFileMetadata, -) -> anyhow::Result<()> { - let local_size = local.file_size(); - let remote_size = remote.file_size(); - +pub(super) fn cleanup_local_file_for_remote(local: &LocalLayerFileMetadata) -> anyhow::Result<()> { + let local_size = local.file_size; + let path = &local.local_path; let file_name = path.file_name().expect("must be file path"); - tracing::warn!("removing local file {file_name:?} because it has unexpected length {local_size}; length in remote index is {remote_size}"); - if let Err(err) = crate::tenant::timeline::rename_to_backup(path) { - assert!( - path.exists(), - "we would leave the local_layer without a file if this does not hold: {path}", - ); - Err(err) - } else { - Ok(()) - } + tracing::warn!( + "removing local file {file_name:?} because it has unexpected length {local_size};" + ); + + std::fs::remove_file(path).with_context(|| format!("failed to remove layer at {path}")) } pub(super) fn cleanup_future_layer( path: &Utf8Path, - name: &LayerFileName, + name: &LayerName, disk_consistent_lsn: Lsn, ) -> anyhow::Result<()> { // future image layers are allowed to be produced always for not yet flushed to disk @@ -211,12 +224,14 @@ pub(super) fn 
cleanup_future_layer( } pub(super) fn cleanup_local_only_file( - path: &Utf8Path, - name: &LayerFileName, - local: &LayerFileMetadata, + name: &LayerName, + local: &LocalLayerFileMetadata, ) -> anyhow::Result<()> { let kind = name.kind(); - tracing::info!("found local-only {kind} layer {name}, metadata {local:?}"); - std::fs::remove_file(path)?; + tracing::info!( + "found local-only {kind} layer {name} size {}", + local.file_size + ); + std::fs::remove_file(&local.local_path)?; Ok(()) } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index e38f5be209..550a9a567a 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -1,4 +1,5 @@ use anyhow::{bail, ensure, Context, Result}; +use itertools::Itertools; use pageserver_api::shard::TenantShardId; use std::{collections::HashMap, sync::Arc}; use tracing::trace; @@ -9,6 +10,7 @@ use utils::{ use crate::{ config::PageServerConf, + context::RequestContext, metrics::TimelineMetrics, tenant::{ layer_map::{BatchedUpdates, LayerMap}, @@ -19,20 +21,16 @@ use crate::{ }, }; +use super::TimelineWriterState; + /// Provides semantic APIs to manipulate the layer map. 
+#[derive(Default)] pub(crate) struct LayerManager { layer_map: LayerMap, layer_fmgr: LayerFileManager, } impl LayerManager { - pub(crate) fn create() -> Self { - Self { - layer_map: LayerMap::default(), - layer_fmgr: LayerFileManager::new(), - } - } - pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { self.layer_fmgr.get_from_desc(desc) } @@ -75,6 +73,7 @@ impl LayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, + ctx: &RequestContext, ) -> Result> { ensure!(lsn.is_aligned()); @@ -111,7 +110,7 @@ impl LayerManager { ); let new_layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?; + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, ctx).await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); @@ -123,17 +122,20 @@ impl LayerManager { Ok(layer) } - /// Called from `freeze_inmem_layer`, returns true if successfully frozen. - pub(crate) async fn try_freeze_in_memory_layer( + /// Tries to freeze an open layer and also manages clearing the TimelineWriterState. + /// + /// Returns true if anything was frozen. + pub(super) async fn try_freeze_in_memory_layer( &mut self, - Lsn(last_record_lsn): Lsn, + lsn: Lsn, last_freeze_at: &AtomicLsn, - ) { + write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + ) -> bool { + let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); - if let Some(open_layer) = &self.layer_map.open_layer { + let froze = if let Some(open_layer) = &self.layer_map.open_layer { let open_layer_rc = Arc::clone(open_layer); - // Does this layer need freezing? open_layer.freeze(end_lsn).await; // The layer is no longer open, update the layer map to reflect this. 
@@ -141,8 +143,25 @@ impl LayerManager { self.layer_map.frozen_layers.push_back(open_layer_rc); self.layer_map.open_layer = None; self.layer_map.next_open_layer_at = Some(end_lsn); - last_freeze_at.store(end_lsn); - } + + true + } else { + false + }; + + // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this + // accounts for regions in the LSN range where we might have ingested no data due to sharding. + last_freeze_at.store(end_lsn); + + // the writer state must no longer have a reference to the frozen layer + let taken = write_lock.take(); + assert_eq!( + froze, + taken.is_some(), + "should only had frozen a layer when TimelineWriterState existed" + ); + + froze } /// Add image layers to the layer map, called from `create_image_layers`. @@ -207,6 +226,57 @@ impl LayerManager { updates.flush(); } + /// Called when a GC-compaction is completed. + #[cfg(test)] + pub(crate) fn finish_gc_compaction( + &mut self, + compact_from: &[Layer], + compact_to: &[ResidentLayer], + metrics: &TimelineMetrics, + ) { + // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification. + self.finish_compact_l0(compact_from, compact_to, metrics) + } + + /// Called when compaction is completed. + pub(crate) fn rewrite_layers( + &mut self, + rewrite_layers: &[(Layer, ResidentLayer)], + drop_layers: &[Layer], + metrics: &TimelineMetrics, + ) { + let mut updates = self.layer_map.batch_update(); + for (old_layer, new_layer) in rewrite_layers { + debug_assert_eq!( + old_layer.layer_desc().key_range, + new_layer.layer_desc().key_range + ); + debug_assert_eq!( + old_layer.layer_desc().lsn_range, + new_layer.layer_desc().lsn_range + ); + + // Safety: we may never rewrite the same file in-place. Callers are responsible + // for ensuring that they only rewrite layers after something changes the path, + // such as an increment in the generation number. 
+ assert_ne!(old_layer.local_path(), new_layer.local_path()); + + Self::delete_historic_layer(old_layer, &mut updates, &mut self.layer_fmgr); + + Self::insert_historic_layer( + new_layer.as_ref().clone(), + &mut updates, + &mut self.layer_fmgr, + ); + + metrics.record_new_file_metrics(new_layer.layer_desc().file_size); + } + for l in drop_layers { + Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr); + } + updates.flush(); + } + /// Called when garbage collect has selected the layers to be removed. pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) { let mut updates = self.layer_map.batch_update(); @@ -216,6 +286,13 @@ impl LayerManager { updates.flush() } + #[cfg(test)] + pub(crate) fn force_insert_layer(&mut self, layer: ResidentLayer) { + let mut updates = self.layer_map.batch_update(); + Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr); + updates.flush() + } + /// Helper function to insert a layer into the layer map and file manager. fn insert_historic_layer( layer: Layer, @@ -246,20 +323,43 @@ impl LayerManager { layer.delete_on_drop(); } + pub(crate) fn likely_resident_layers(&self) -> impl Iterator + '_ { + // for small layer maps, we most likely have all resident, but for larger more are likely + // to be evicted assuming lots of layers correlated with longer lifespan. 
+ + self.layer_map().iter_historic_layers().filter_map(|desc| { + self.layer_fmgr + .0 + .get(&desc.key()) + .filter(|l| l.is_likely_resident()) + .cloned() + }) + } + pub(crate) fn contains(&self, layer: &Layer) -> bool { self.layer_fmgr.contains(layer) } + + pub(crate) fn all_persistent_layers(&self) -> Vec { + self.layer_fmgr.0.keys().cloned().collect_vec() + } } pub(crate) struct LayerFileManager(HashMap); +impl Default for LayerFileManager { + fn default() -> Self { + Self(HashMap::default()) + } +} + impl LayerFileManager { fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T { // The assumption for the `expect()` is that all code maintains the following invariant: // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. self.0 .get(&desc.key()) - .with_context(|| format!("get layer from desc: {}", desc.filename())) + .with_context(|| format!("get layer from desc: {}", desc.layer_name())) .expect("not found") .clone() } @@ -275,10 +375,6 @@ impl LayerFileManager { self.0.contains_key(&layer.layer_desc().key()) } - pub(crate) fn new() -> Self { - Self(HashMap::new()) - } - pub(crate) fn remove(&mut self, layer: &T) { let present = self.0.remove(&layer.layer_desc().key()); if present.is_none() && cfg!(debug_assertions) { diff --git a/pageserver/src/tenant/timeline/logical_size.rs b/pageserver/src/tenant/timeline/logical_size.rs index 03bc59ea38..8f9ca0e29f 100644 --- a/pageserver/src/tenant/timeline/logical_size.rs +++ b/pageserver/src/tenant/timeline/logical_size.rs @@ -101,6 +101,14 @@ impl From<&Exact> for u64 { } } +impl Approximate { + /// For use in situations where we don't have a sane logical size value but need + /// to return something, e.g. in HTTP API on shard >0 of a sharded tenant. 
+ pub(crate) fn zero() -> Self { + Self(0) + } +} + impl CurrentLogicalSize { pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 { match self { diff --git a/pageserver/src/tenant/timeline/span.rs b/pageserver/src/tenant/timeline/span.rs index 3b580c9d1b..8b13789179 100644 --- a/pageserver/src/tenant/timeline/span.rs +++ b/pageserver/src/tenant/timeline/span.rs @@ -1,20 +1 @@ -#[cfg(debug_assertions)] -use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor}; -#[cfg(not(debug_assertions))] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {} - -#[cfg(debug_assertions)] -#[track_caller] -pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() { - static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy> = - once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"])); - - let fields: [&dyn Extractor; 2] = [ - &*crate::tenant::span::TENANT_ID_EXTRACTOR, - &*TIMELINE_ID_EXTRACTOR, - ]; - if let Err(missing) = check_fields_present!(fields) { - panic!("missing extractors: {missing:?}") - } -} diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 27d6fd9c28..2b60e670ea 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -2,8 +2,8 @@ use std::{collections::hash_map::Entry, fs, sync::Arc}; use anyhow::Context; use camino::Utf8PathBuf; -use tracing::{error, info, info_span, warn}; -use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn}; +use tracing::{error, info, info_span}; +use utils::{fs_ext, id::TimelineId, lsn::Lsn}; use crate::{context::RequestContext, import_datadir, tenant::Tenant}; @@ -11,22 +11,22 @@ use super::Timeline; /// A timeline with some of its files on disk, being initialized. /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or -/// its local files are removed. 
In the worst case of a crash, an uninit mark file is left behind, which causes the directory -/// to be removed on next restart. +/// its local files are removed. If we crash while this class exists, then the timeline's local +/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage. /// /// The caller is responsible for proper timeline data filling before the final init. #[must_use] pub struct UninitializedTimeline<'t> { pub(crate) owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineUninitMark<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, } impl<'t> UninitializedTimeline<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - raw_timeline: Option<(Arc, TimelineUninitMark<'t>)>, + raw_timeline: Option<(Arc, TimelineCreateGuard<'t>)>, ) -> Self { Self { owning_tenant, @@ -35,8 +35,7 @@ impl<'t> UninitializedTimeline<'t> { } } - /// Finish timeline creation: insert it into the Tenant's timelines map and remove the - /// uninit mark file. + /// Finish timeline creation: insert it into the Tenant's timelines map /// /// This function launches the flush loop if not already done. /// @@ -72,16 +71,9 @@ impl<'t> UninitializedTimeline<'t> { Entry::Vacant(v) => { // after taking here should be no fallible operations, because the drop guard will not // cleanup after and would block for example the tenant deletion - let (new_timeline, uninit_mark) = + let (new_timeline, _create_guard) = self.raw_timeline.take().expect("already checked"); - // this is the mutual exclusion between different retries to create the timeline; - // this should be an assertion. 
- uninit_mark.remove_uninit_mark().with_context(|| { - format!( - "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}" - ) - })?; v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); @@ -94,6 +86,7 @@ impl<'t> UninitializedTimeline<'t> { /// Prepares timeline data by loading it from the basebackup archive. pub(crate) async fn import_basebackup_from_tar( self, + tenant: Arc, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, broker_client: storage_broker::BrokerClientChannel, @@ -120,10 +113,9 @@ impl<'t> UninitializedTimeline<'t> { .await .context("Failed to flush after basebackup import")?; - // All the data has been imported. Insert the Timeline into the tenant's timelines - // map and remove the uninit mark file. + // All the data has been imported. Insert the Timeline into the tenant's timelines map let tl = self.finish_creation()?; - tl.activate(broker_client, None, ctx); + tl.activate(tenant, broker_client, None, ctx); Ok(tl) } @@ -143,37 +135,35 @@ impl<'t> UninitializedTimeline<'t> { impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { - if let Some((_, uninit_mark)) = self.raw_timeline.take() { + if let Some((_, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); error!("Timeline got dropped without initializing, cleaning its files"); - cleanup_timeline_directory(uninit_mark); + cleanup_timeline_directory(create_guard); } } } -pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { - let timeline_path = &uninit_mark.timeline_path; +pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) { + let timeline_path = &create_guard.timeline_path; match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { 
Ok(()) => { - info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") + info!("Timeline dir {timeline_path:?} removed successfully") } Err(e) => { error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}") } } - drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists + // Having cleaned up, we can release this TimelineId in `[Tenant::timelines_creating]` to allow other + // timeline creation attempts under this TimelineId to proceed + drop(create_guard); } -/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory, -/// or gets removed eventually. -/// -/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first. +/// A guard for timeline creations in process: as long as this object exists, the timeline ID +/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline. #[must_use] -pub(crate) struct TimelineUninitMark<'t> { +pub(crate) struct TimelineCreateGuard<'t> { owning_tenant: &'t Tenant, timeline_id: TimelineId, - uninit_mark_deleted: bool, - uninit_mark_path: Utf8PathBuf, pub(crate) timeline_path: Utf8PathBuf, } @@ -190,11 +180,10 @@ pub(crate) enum TimelineExclusionError { Other(#[from] anyhow::Error), } -impl<'t> TimelineUninitMark<'t> { +impl<'t> TimelineCreateGuard<'t> { pub(crate) fn new( owning_tenant: &'t Tenant, timeline_id: TimelineId, - uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf, ) -> Result { // Lock order: this is the only place we take both locks. 
During drop() we only @@ -214,56 +203,14 @@ impl<'t> TimelineUninitMark<'t> { Ok(Self { owning_tenant, timeline_id, - uninit_mark_deleted: false, - uninit_mark_path, timeline_path, }) } } - - fn remove_uninit_mark(mut self) -> anyhow::Result<()> { - if !self.uninit_mark_deleted { - self.delete_mark_file_if_present()?; - } - - Ok(()) - } - - fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> { - let uninit_mark_file = &self.uninit_mark_path; - let uninit_mark_parent = uninit_mark_file - .parent() - .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { - format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") - })?; - crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; - self.uninit_mark_deleted = true; - - Ok(()) - } } -impl Drop for TimelineUninitMark<'_> { +impl Drop for TimelineCreateGuard<'_> { fn drop(&mut self) { - if !self.uninit_mark_deleted { - if self.timeline_path.exists() { - error!( - "Uninit mark {} is not removed, timeline {} stays uninitialized", - self.uninit_mark_path, self.timeline_path - ) - } else { - // unblock later timeline creation attempts - warn!( - "Removing intermediate uninit mark file {}", - self.uninit_mark_path - ); - if let Err(e) = self.delete_mark_file_if_present() { - error!("Failed to remove the uninit mark file: {e}") - } - } - } - self.owning_tenant .timelines_creating .lock() diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 2fab6722b8..a085154a5a 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -24,26 +24,21 @@ mod connection_manager; mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::task_mgr::{TaskKind, 
WALRECEIVER_RUNTIME}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; -use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; -use std::ops::ControlFlow; use std::sync::Arc; use std::time::Duration; use storage_broker::BrokerClientChannel; -use tokio::select; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TimelineId; - use self::connection_manager::ConnectionManagerStatus; use super::Timeline; @@ -62,9 +57,10 @@ pub struct WalReceiverConf { } pub struct WalReceiver { - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, manager_status: Arc>>, + /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. + /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. 
+ cancel: CancellationToken, } impl WalReceiver { @@ -78,65 +74,58 @@ impl WalReceiver { let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverManager, - Some(timeline.tenant_shard_id), - Some(timeline_id), - &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"), - false, + let cancel = timeline.cancel.child_token(); + WALRECEIVER_RUNTIME.spawn({ + let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); + // acquire timeline gate so we know the task doesn't outlive the Timeline + let Ok(_guard) = timeline.gate.enter() else { + debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already"); + return; + }; debug!("WAL receiver manager started, connecting to broker"); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, + cancel.clone(), ); - loop { - select! 
{ - _ = task_mgr::shutdown_watcher() => { - trace!("WAL receiver shutdown requested, shutting down"); + while !cancel.is_cancelled() { + let loop_step_result = connection_manager_loop_step( + &mut broker_client, + &mut connection_manager_state, + &walreceiver_ctx, + &cancel, + &loop_status, + ).await; + match loop_step_result { + Ok(()) => continue, + Err(_cancelled) => { + trace!("Connection manager loop ended, shutting down"); break; - }, - loop_step_result = connection_manager_loop_step( - &mut broker_client, - &mut connection_manager_state, - &walreceiver_ctx, - &loop_status, - ) => match loop_step_result { - ControlFlow::Continue(()) => continue, - ControlFlow::Break(()) => { - trace!("Connection manager loop ended, shutting down"); - break; - } - }, + } } } - connection_manager_state.shutdown().await; *loop_status.write().unwrap() = None; - Ok(()) + debug!("task exits"); } .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) - ); + }); Self { - tenant_shard_id, - timeline_id, manager_status, + cancel, } } - pub async fn stop(self) { - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + #[instrument(skip_all, level = tracing::Level::DEBUG)] + pub fn cancel(&self) { + debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("cancelling walreceiver tasks"); + self.cancel.cancel(); } pub(crate) fn status(&self) -> Option { @@ -170,14 +159,18 @@ enum TaskStateUpdate { impl TaskHandle { /// Initializes the task, starting it immediately after the creation. + /// + /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]). + /// It being a child token enables us to provide a [`Self::shutdown`] method. 
fn spawn( + cancel_parent: &CancellationToken, task: impl FnOnce(watch::Sender>, CancellationToken) -> Fut + Send + 'static, ) -> Self where Fut: Future> + Send, E: Send + Sync + 'static, { - let cancellation = CancellationToken::new(); + let cancellation = cancel_parent.child_token(); let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); @@ -197,6 +190,9 @@ impl TaskHandle { } } + /// # Cancel-Safety + /// + /// Cancellation-safe. async fn next_task_event(&mut self) -> TaskEvent { match self.events_receiver.changed().await { Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()), diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 7fa5bb7689..1d2ffec08f 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -17,17 +17,19 @@ use crate::metrics::{ WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED, WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; -use crate::task_mgr::{shutdown_token, TaskKind}; +use crate::task_mgr::TaskKind; use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; -use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey; -use storage_broker::proto::SafekeeperTimelineInfo; -use storage_broker::proto::SubscribeSafekeeperInfoRequest; + use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use storage_broker::proto::{ + FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse, + SubscribeByFilterRequest, TypeSubscription, TypedMessage, +}; use storage_broker::{BrokerClientChannel, Code, Streaming}; -use tokio::select; +use 
tokio_util::sync::CancellationToken; use tracing::*; use postgres_connection::PgConnectionConfig; @@ -45,27 +47,33 @@ use super::{ TaskEvent, TaskHandle, }; +pub(crate) struct Cancelled; + /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker. /// Based on the updates, desides whether to start, keep or stop a WAL receiver task. /// If storage broker subscription is cancelled, exits. +/// +/// # Cancel-Safety +/// +/// Not cancellation-safe. Use `cancel` token to request cancellation. pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, + cancel: &CancellationToken, manager_status: &std::sync::RwLock>, -) -> ControlFlow<(), ()> { - match connection_manager_state - .timeline - .wait_to_become_active(ctx) - .await - { +) -> Result<(), Cancelled> { + match tokio::select! { + _ = cancel.cancelled() => { return Err(Cancelled); }, + st = connection_manager_state.timeline.wait_to_become_active(ctx) => { st } + } { Ok(()) => {} Err(new_state) => { debug!( ?new_state, "state changed, stopping wal connection manager loop" ); - return ControlFlow::Break(()); + return Err(Cancelled); } } @@ -83,17 +91,28 @@ pub(super) async fn connection_manager_loop_step( .timeline .subscribe_for_state_updates(); + let mut wait_lsn_status = connection_manager_state + .timeline + .subscribe_for_wait_lsn_updates(); + + // TODO: create a separate config option for discovery request interval + let discovery_request_interval = connection_manager_state.conf.lagging_wal_timeout; + let mut last_discovery_ts: Option = None; + // Subscribe to the broker updates. Stream shares underlying TCP connection // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. 
- let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await; + let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; debug!("Subscribed for broker timeline updates"); loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); + let any_activity = connection_manager_state.wal_connection.is_some() + || !connection_manager_state.wal_stream_candidates.is_empty(); // These things are happening concurrently: // + // - cancellation request // - keep receiving WAL on the current connection // - if the shared state says we need to change connection, disconnect and return // - this runs in a separate task and we receive updates via a watch channel @@ -101,7 +120,12 @@ pub(super) async fn connection_manager_loop_step( // - receive updates from broker // - this might change the current desired connection // - timeline state changes to something that does not allow walreceiver to run concurrently - select! { + // - if there's no connection and no candidates, try to send a discovery request + + // NB: make sure each of the select expressions are cancellation-safe + // (no need for arms to be cancellation-safe). + tokio::select! 
{ + _ = cancel.cancelled() => { return Err(Cancelled); } Some(wal_connection_update) = async { match connection_manager_state.wal_connection.as_mut() { Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await), @@ -133,7 +157,7 @@ pub(super) async fn connection_manager_loop_step( }, // Got a new update from the broker - broker_update = broker_subscription.message() => { + broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => { match broker_update { Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update), Err(status) => { @@ -147,16 +171,17 @@ pub(super) async fn connection_manager_loop_step( warn!("broker subscription failed: {status}"); } } - return ControlFlow::Continue(()); + return Ok(()); } Ok(None) => { error!("broker subscription stream ended"); // can't happen - return ControlFlow::Continue(()); + return Ok(()); } } }, new_event = async { + // Reminder: this match arm needs to be cancellation-safe. loop { if connection_manager_state.timeline.current_state() == TimelineState::Loading { warn!("wal connection manager should only be launched after timeline has become active"); @@ -182,11 +207,11 @@ pub(super) async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(()) => { - return ControlFlow::Continue(()); + return Ok(()); } ControlFlow::Break(()) => { debug!("Timeline is no longer active, stopping wal connection manager loop"); - return ControlFlow::Break(()); + return Err(Cancelled); } }, @@ -202,6 +227,65 @@ pub(super) async fn connection_manager_loop_step( } } } => debug!("Waking up for the next retry after waiting for {time_until_next_retry:?}"), + + Some(()) = async { + // Reminder: this match arm needs to be cancellation-safe. + // Calculating time needed to wait until sending the next discovery request. + // Current implementation is conservative and sends discovery requests only when there are no candidates. 
+ + if any_activity { + // No need to send discovery requests if there is an active connection or candidates. + return None; + } + + // Waiting for an active wait_lsn request. + while wait_lsn_status.borrow().is_none() { + if wait_lsn_status.changed().await.is_err() { + // wait_lsn_status channel was closed, exiting + warn!("wait_lsn_status channel was closed in connection_manager_loop_step"); + return None; + } + } + + // All preconditions met, preparing to send a discovery request. + let now = std::time::Instant::now(); + let next_discovery_ts = last_discovery_ts + .map(|ts| ts + discovery_request_interval) + .unwrap_or_else(|| now); + + if next_discovery_ts > now { + // Prevent sending discovery requests too frequently. + tokio::time::sleep(next_discovery_ts - now).await; + } + + let tenant_timeline_id = Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }); + let request = SafekeeperDiscoveryRequest { tenant_timeline_id }; + let msg = TypedMessage { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: Some(request), + safekeeper_discovery_response: None, + }; + + last_discovery_ts = Some(std::time::Instant::now()); + debug!("No active connection and no candidates, sending discovery request to the broker"); + + // Cancellation safety: we want to send a message to the broker, but publish_one() + // function can get cancelled by the other select! arm. This is absolutely fine, because + // we just want to receive broker updates and discovery is not important if we already + // receive updates. + // + // It is possible that `last_discovery_ts` will be updated, but the message will not be sent. + // This is totally fine because of the reason above. 
+ + // This is a fire-and-forget request, we don't care about the response + let _ = broker_client.publish_one(msg).await; + debug!("Discovery request sent to the broker"); + None + } => {} } if let Some(new_candidate) = connection_manager_state.next_connection_candidate() { @@ -218,32 +302,46 @@ pub(super) async fn connection_manager_loop_step( async fn subscribe_for_timeline_updates( broker_client: &mut BrokerClientChannel, id: TenantTimelineId, -) -> Streaming { + cancel: &CancellationToken, +) -> Result, Cancelled> { let mut attempt = 0; - let cancel = shutdown_token(); - loop { exponential_backoff( attempt, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, - &cancel, + cancel, ) .await; attempt += 1; // subscribe to the specific timeline - let key = SubscriptionKey::TenantTimelineId(ProtoTenantTimelineId { - tenant_id: id.tenant_id.as_ref().to_owned(), - timeline_id: id.timeline_id.as_ref().to_owned(), - }); - let request = SubscribeSafekeeperInfoRequest { - subscription_key: Some(key), + let request = SubscribeByFilterRequest { + types: vec![ + TypeSubscription { + r#type: MessageType::SafekeeperTimelineInfo as i32, + }, + TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + }, + ], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: true, + tenant_timeline_id: Some(ProtoTenantTimelineId { + tenant_id: id.tenant_id.as_ref().to_owned(), + timeline_id: id.timeline_id.as_ref().to_owned(), + }), + }), }; - match broker_client.subscribe_safekeeper_info(request).await { + match { + tokio::select! 
{ + r = broker_client.subscribe_by_filter(request) => { r } + _ = cancel.cancelled() => { return Err(Cancelled); } + } + } { Ok(resp) => { - return resp.into_inner(); + return Ok(resp.into_inner()); } Err(e) => { // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and @@ -264,6 +362,8 @@ pub(super) struct ConnectionManagerState { id: TenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, + /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn. + cancel: CancellationToken, conf: WalReceiverConf, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, @@ -380,13 +480,17 @@ struct RetryInfo { /// Data about the timeline to connect to, received from the broker. #[derive(Debug, Clone)] struct BrokerSkTimeline { - timeline: SafekeeperTimelineInfo, + timeline: SafekeeperDiscoveryResponse, /// Time at which the data was fetched from the broker last time, to track the stale data. latest_update: NaiveDateTime, } impl ConnectionManagerState { - pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { + pub(super) fn new( + timeline: Arc, + conf: WalReceiverConf, + cancel: CancellationToken, + ) -> Self { let id = TenantTimelineId { tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, @@ -394,6 +498,7 @@ impl ConnectionManagerState { Self { id, timeline, + cancel, conf, wal_connection: None, wal_stream_candidates: HashMap::new(), @@ -401,6 +506,22 @@ impl ConnectionManagerState { } } + fn spawn( + &self, + task: impl FnOnce( + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, + ) -> TaskHandle + where + Fut: std::future::Future> + Send, + { + // TODO: get rid of TaskHandle + super::TaskHandle::spawn(&self.cancel, task) + } + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. 
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { WALRECEIVER_SWITCHES @@ -419,7 +540,7 @@ impl ConnectionManagerState { ); let span = info_span!("connection", %node_id); - let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { + let connection_handle = self.spawn(move |events_sender, cancellation| { async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -447,6 +568,12 @@ impl ConnectionManagerState { info!("walreceiver connection handling ended: {e}"); Ok(()) } + WalReceiverError::ClosedGate => { + info!( + "walreceiver connection handling ended because of closed gate" + ); + Ok(()) + } WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { @@ -486,6 +613,10 @@ impl ConnectionManagerState { /// Drops the current connection (if any) and updates retry timeout for the next /// connection attempt to the same safekeeper. + /// + /// # Cancel-Safety + /// + /// Not cancellation-safe. async fn drop_old_connection(&mut self, needs_shutdown: bool) { let wal_connection = match self.wal_connection.take() { Some(wal_connection) => wal_connection, @@ -493,7 +624,14 @@ impl ConnectionManagerState { }; if needs_shutdown { - wal_connection.connection_task.shutdown().await; + wal_connection + .connection_task + .shutdown() + // This here is why this function isn't cancellation-safe. + // If we got cancelled here, then self.wal_connection is already None and we lose track of the task. + // Even if our caller diligently calls Self::shutdown(), it will find a self.wal_connection=None + // and thus be ineffective. + .await; } let retry = self @@ -550,9 +688,59 @@ impl ConnectionManagerState { } /// Adds another broker timeline into the state, if its more recent than the one already added there for the same key. 
- fn register_timeline_update(&mut self, timeline_update: SafekeeperTimelineInfo) { + fn register_timeline_update(&mut self, typed_msg: TypedMessage) { + let mut is_discovery = false; + let timeline_update = match typed_msg.r#type() { + MessageType::SafekeeperTimelineInfo => { + let info = match typed_msg.safekeeper_timeline_info { + Some(info) => info, + None => { + warn!("bad proto message from broker: no safekeeper_timeline_info"); + return; + } + }; + SafekeeperDiscoveryResponse { + safekeeper_id: info.safekeeper_id, + tenant_timeline_id: info.tenant_timeline_id, + commit_lsn: info.commit_lsn, + safekeeper_connstr: info.safekeeper_connstr, + availability_zone: info.availability_zone, + standby_horizon: info.standby_horizon, + } + } + MessageType::SafekeeperDiscoveryResponse => { + is_discovery = true; + match typed_msg.safekeeper_discovery_response { + Some(response) => response, + None => { + warn!("bad proto message from broker: no safekeeper_discovery_response"); + return; + } + } + } + _ => { + // unexpected message + return; + } + }; + WALRECEIVER_BROKER_UPDATES.inc(); + trace!( + "safekeeper info update: standby_horizon(cutoff)={}", + timeline_update.standby_horizon + ); + if timeline_update.standby_horizon != 0 { + // ignore reports from safekeepers not connected to replicas + self.timeline + .standby_horizon + .store(Lsn(timeline_update.standby_horizon)); + self.timeline + .metrics + .standby_horizon_gauge + .set(timeline_update.standby_horizon as i64); + } + let new_safekeeper_id = NodeId(timeline_update.safekeeper_id); let old_entry = self.wal_stream_candidates.insert( new_safekeeper_id, @@ -563,7 +751,11 @@ impl ConnectionManagerState { ); if old_entry.is_none() { - info!("New SK node was added: {new_safekeeper_id}"); + info!( + ?is_discovery, + %new_safekeeper_id, + "New SK node was added", + ); WALRECEIVER_CANDIDATES_ADDED.inc(); } } @@ -762,7 +954,7 @@ impl ConnectionManagerState { fn select_connection_candidate( &self, node_to_omit: Option, - ) 
-> Option<(NodeId, &SafekeeperTimelineInfo, PgConnectionConfig)> { + ) -> Option<(NodeId, &SafekeeperDiscoveryResponse, PgConnectionConfig)> { self.applicable_connection_candidates() .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit) .max_by_key(|(_, info, _)| info.commit_lsn) @@ -772,7 +964,7 @@ impl ConnectionManagerState { /// Some safekeepers are filtered by the retry cooldown. fn applicable_connection_candidates( &self, - ) -> impl Iterator { + ) -> impl Iterator { let now = Utc::now().naive_utc(); self.wal_stream_candidates @@ -838,6 +1030,9 @@ impl ConnectionManagerState { } } + /// # Cancel-Safety + /// + /// Not cancellation-safe. pub(super) async fn shutdown(mut self) { if let Some(wal_connection) = self.wal_connection.take() { wal_connection.connection_task.shutdown().await; @@ -909,20 +1104,13 @@ mod tests { latest_update: NaiveDateTime, ) -> BrokerSkTimeline { BrokerSkTimeline { - timeline: SafekeeperTimelineInfo { + timeline: SafekeeperDiscoveryResponse { safekeeper_id: 0, tenant_timeline_id: None, - term: 0, - last_log_term: 0, - flush_lsn: 0, commit_lsn, - backup_lsn: 0, - remote_consistent_lsn: 0, - peer_horizon_lsn: 0, - local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), - http_connstr: safekeeper_connstr.to_owned(), availability_zone: None, + standby_horizon: 0, }, latest_update, } @@ -986,7 +1174,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1154,7 +1342,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1221,7 +1409,7 @@ mod tests { sk_id: 
NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1285,7 +1473,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), + connection_task: state.spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { discovered_at: time_over_threshold, lsn: new_lsn, @@ -1337,10 +1525,11 @@ mod tests { ConnectionManagerState { id: TenantTimelineId { - tenant_id: harness.tenant_id, + tenant_id: harness.tenant_shard_id.tenant_id, timeline_id: TIMELINE_ID, }, timeline, + cancel: CancellationToken::new(), conf: WalReceiverConf { wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), @@ -1363,7 +1552,7 @@ mod tests { let harness = TenantHarness::create("switch_to_same_availability_zone")?; let mut state = dummy_state(&harness).await; - state.conf.availability_zone = test_az.clone(); + state.conf.availability_zone.clone_from(&test_az); let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1384,7 +1573,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1396,7 +1585,7 @@ mod tests { // We have another safekeeper with the same commit_lsn, and it have the same availability zone as // the current pageserver. 
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); - same_az_sk.timeline.availability_zone = test_az.clone(); + same_az_sk.timeline.availability_zone.clone_from(&test_az); state.wal_stream_candidates = HashMap::from([ ( diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index e398d683e5..c6ee6b90c4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -27,7 +27,6 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -37,8 +36,8 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::pageserver_feedback::PageserverFeedback; use utils::{id::NodeId, lsn::Lsn}; +use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -68,6 +67,7 @@ pub(super) enum WalReceiverError { SuccessfulCompletion(String), /// Generic error Other(anyhow::Error), + ClosedGate, } impl From for WalReceiverError { @@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection( ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // prevent timeline shutdown from finishing until we have exited + let _guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + // This function spawns a side-car task (WalReceiverConnectionPoller). + // Get its gate guard now as well. 
+ let poller_guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + WALRECEIVER_STARTED_CONNECTIONS.inc(); // Connect to the database in replication mode. @@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection( } // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. + // so spawn it off to run on its own. It shouldn't outlive this function, but, + // due to lack of async drop, we can't enforce that. However, we ensure that + // 1. it is sensitive to `cancellation` and + // 2. holds the Timeline gate open so that after timeline shutdown, + // we know this task is gone. let _connection_ctx = ctx.detached_child( TaskKind::WalReceiverConnectionPoller, ctx.download_behavior(), ); let connection_cancellation = cancellation.clone(); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - "walreceiver connection", - false, + WALRECEIVER_RUNTIME.spawn( async move { debug_assert_current_span_has_tenant_and_timeline_id(); - select! { connection_result = connection => match connection_result { Ok(()) => debug!("Walreceiver db connection closed"), @@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. }, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::ClosedGate => { + // doesn't happen at runtime + } WalReceiverError::Other(err) => { warn!("Connection aborted: {err:#}") } @@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection( }, _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } - Ok(()) + drop(poller_guard); } // Enrich the log lines emitted by this closure with meaningful context. 
// TODO: technically, this task outlives the surrounding function, so, the @@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection( trace!("received XLogData between {startlsn} and {endlsn}"); + WAL_INGEST.bytes_received.inc_by(data.len() as u64); waldecoder.feed_bytes(data); { @@ -389,16 +400,6 @@ pub(super) async fn handle_walreceiver_connection( } } - timeline - .check_checkpoint_distance() - .await - .with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; - if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = timeline .get_remote_consistent_lsn_visible() @@ -426,19 +427,28 @@ pub(super) async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. - let current_timeline_size = timeline - .get_current_logical_size( - crate::tenant::timeline::GetLogicalSizePriority::User, - &ctx, - ) - // FIXME: https://github.com/neondatabase/neon/issues/5963 - .size_dont_care_about_accuracy(); + let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() { + timeline + .get_current_logical_size( + crate::tenant::timeline::GetLogicalSizePriority::User, + &ctx, + ) + // FIXME: https://github.com/neondatabase/neon/issues/5963 + .size_dont_care_about_accuracy() + } else { + // Non-zero shards send zero for logical size. The safekeeper will ignore + // this number. This is because in a sharded tenant, only shard zero maintains + // accurate logical size. 
+ 0 + }; + let status_update = PageserverFeedback { current_timeline_size, last_received_lsn, disk_consistent_lsn, remote_consistent_lsn, replytime: ts, + shard_number: timeline.tenant_shard_id.shard_number.0 as u32, }; debug!("neon_status_update {status_update:?}"); diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 32f14f40c5..50c977a950 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -1,4 +1,4 @@ -use super::storage_layer::LayerFileName; +use super::storage_layer::LayerName; use super::storage_layer::ResidentLayer; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::remote_timeline_client::index::IndexPart; @@ -43,28 +43,25 @@ pub(crate) struct UploadQueueInitialized { /// Counter to assign task IDs pub(crate) task_counter: u64, - /// All layer files stored in the remote storage, taking into account all - /// in-progress and queued operations - pub(crate) latest_files: HashMap, + /// The next uploaded index_part.json; assumed to be dirty. + /// + /// Should not be read, directly except for layer file updates. Instead you should add a + /// projected field. + pub(crate) dirty: IndexPart, + + /// The latest remote persisted IndexPart. + /// + /// Each completed metadata upload will update this. The second item is the task_id which last + /// updated the value, used to ensure we never store an older value over a newer one. + pub(crate) clean: (IndexPart, Option), /// How many file uploads or deletions been scheduled, since the /// last (scheduling of) metadata index upload? pub(crate) latest_files_changes_since_metadata_upload_scheduled: u64, - /// Metadata stored in the remote storage, taking into account all - /// in-progress and queued operations. - /// DANGER: do not return to outside world, e.g., safekeepers. - pub(crate) latest_metadata: TimelineMetadata, - - /// `disk_consistent_lsn` from the last metadata file that was successfully - /// uploaded. 
`Lsn(0)` if nothing was uploaded yet. - /// Unlike `latest_files` or `latest_metadata`, this value is never ahead. - /// Safekeeper can rely on it to make decisions for WAL storage. - /// - /// visible_remote_consistent_lsn is only updated after our generation has been validated with + /// The Lsn is only updated after our generation has been validated with /// the control plane (unlesss a timeline's generation is None, in which case /// we skip validation) - pub(crate) projected_remote_consistent_lsn: Option, pub(crate) visible_remote_consistent_lsn: Arc, // Breakdown of different kinds of tasks currently in-progress @@ -89,7 +86,7 @@ pub(crate) struct UploadQueueInitialized { /// Putting this behind a testing feature to catch problems in tests, but assuming we could have a /// bug causing leaks, then it's better to not leave this enabled for production builds. #[cfg(feature = "testing")] - pub(crate) dangling_files: HashMap, + pub(crate) dangling_files: HashMap, /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`. 
pub(crate) shutting_down: bool, @@ -110,7 +107,8 @@ impl UploadQueueInitialized { } pub(super) fn get_last_remote_consistent_lsn_projected(&self) -> Option { - self.projected_remote_consistent_lsn + let lsn = self.clean.0.metadata.disk_consistent_lsn(); + self.clean.1.map(|_| lsn) } } @@ -121,11 +119,37 @@ pub(super) enum SetDeletedFlagProgress { Successful(NaiveDateTime), } -pub(super) struct UploadQueueStopped { +pub(super) struct UploadQueueStoppedDeletable { pub(super) upload_queue_for_deletion: UploadQueueInitialized, pub(super) deleted_at: SetDeletedFlagProgress, } +pub(super) enum UploadQueueStopped { + Deletable(UploadQueueStoppedDeletable), + Uninitialized, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum NotInitialized { + #[error("queue is in state Uninitialized")] + Uninitialized, + #[error("queue is in state Stopped")] + Stopped, + #[error("queue is shutting down")] + ShuttingDown, +} + +impl NotInitialized { + pub(crate) fn is_stopping(&self) -> bool { + use NotInitialized::*; + match self { + Uninitialized => false, + Stopped => true, + ShuttingDown => true, + } + } +} + impl UploadQueue { pub(crate) fn initialize_empty_remote( &mut self, @@ -140,12 +164,12 @@ impl UploadQueue { info!("initializing upload queue for empty remote"); + let index_part = IndexPart::empty(metadata.clone()); + let state = UploadQueueInitialized { - // As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead. 
- latest_files: HashMap::new(), + dirty: index_part.clone(), + clean: (index_part, None), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: metadata.clone(), - projected_remote_consistent_lsn: None, visible_remote_consistent_lsn: Arc::new(AtomicLsn::new(0)), // what follows are boring default initializations task_counter: 0, @@ -175,24 +199,15 @@ impl UploadQueue { } } - let mut files = HashMap::with_capacity(index_part.layer_metadata.len()); - for (layer_name, layer_metadata) in &index_part.layer_metadata { - files.insert( - layer_name.to_owned(), - LayerFileMetadata::from(layer_metadata), - ); - } - info!( "initializing upload queue with remote index_part.disk_consistent_lsn: {}", index_part.metadata.disk_consistent_lsn() ); let state = UploadQueueInitialized { - latest_files: files, + dirty: index_part.clone(), + clean: (index_part.clone(), None), latest_files_changes_since_metadata_upload_scheduled: 0, - latest_metadata: index_part.metadata.clone(), - projected_remote_consistent_lsn: Some(index_part.metadata.disk_consistent_lsn()), visible_remote_consistent_lsn: Arc::new( index_part.metadata.disk_consistent_lsn().into(), ), @@ -214,26 +229,29 @@ impl UploadQueue { } pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + use UploadQueue::*; match self { - UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { - anyhow::bail!("queue is in state {}", self.as_str()) - } - UploadQueue::Initialized(x) => { - if !x.shutting_down { - Ok(x) + Uninitialized => Err(NotInitialized::Uninitialized.into()), + Initialized(x) => { + if x.shutting_down { + Err(NotInitialized::ShuttingDown.into()) } else { - anyhow::bail!("queue is shutting down") + Ok(x) } } + Stopped(_) => Err(NotInitialized::Stopped.into()), } } - pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> { + pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStoppedDeletable> { match self { 
UploadQueue::Initialized(_) | UploadQueue::Uninitialized => { anyhow::bail!("queue is in state {}", self.as_str()) } - UploadQueue::Stopped(stopped) => Ok(stopped), + UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => { + anyhow::bail!("queue is in state Stopped(Uninitialized)") + } + UploadQueue::Stopped(UploadQueueStopped::Deletable(deletable)) => Ok(deletable), } } } @@ -252,7 +270,7 @@ pub(crate) struct UploadTask { /// for timeline deletion, which skips this queue and goes directly to DeletionQueue. #[derive(Debug)] pub(crate) struct Delete { - pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>, + pub(crate) layers: Vec<(LayerName, LayerFileMetadata)>, } #[derive(Debug)] @@ -260,13 +278,16 @@ pub(crate) enum UploadOp { /// Upload a layer file UploadLayer(ResidentLayer, LayerFileMetadata), - /// Upload the metadata file - UploadMetadata(IndexPart, Lsn), + /// Upload a index_part.json file + UploadMetadata { + /// The next [`UploadQueueInitialized::clean`] after this upload succeeds. + uploaded: Box, + }, /// Delete layer files Delete(Delete), - /// Barrier. When the barrier operation is reached, + /// Barrier. When the barrier operation is reached, the channel is closed. Barrier(tokio::sync::watch::Sender<()>), /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise @@ -281,13 +302,15 @@ impl std::fmt::Display for UploadOp { write!( f, "UploadLayer({}, size={:?}, gen={:?})", - layer, - metadata.file_size(), - metadata.generation + layer, metadata.file_size, metadata.generation ) } - UploadOp::UploadMetadata(_, lsn) => { - write!(f, "UploadMetadata(lsn: {})", lsn) + UploadOp::UploadMetadata { uploaded, .. 
} => { + write!( + f, + "UploadMetadata(lsn: {})", + uploaded.metadata.disk_consistent_lsn() + ) } UploadOp::Delete(delete) => { write!(f, "Delete({} layers)", delete.layers.len()) diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs new file mode 100644 index 0000000000..6e825760e3 --- /dev/null +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -0,0 +1,449 @@ +//! +//! Utilities for vectored reading of variable-sized "blobs". +//! +//! The "blob" api is an abstraction on top of the "block" api, +//! with the main difference being that blobs do not have a fixed +//! size (each blob is prefixed with 1 or 4 byte length field) +//! +//! The vectored apis provided in this module allow for planning +//! and executing disk IO which covers multiple blobs. +//! +//! Reads are planned with [`VectoredReadPlanner`] which will coalesce +//! adjacent blocks into a single disk IO request and exectuted by +//! [`VectoredBlobReader`] which does all the required offset juggling +//! and returns a buffer housing all the blobs and a list of offsets. +//! +//! Note that the vectored blob api does *not* go through the page cache. + +use std::collections::BTreeMap; +use std::num::NonZeroUsize; + +use bytes::BytesMut; +use pageserver_api::key::Key; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; + +use crate::context::RequestContext; +use crate::virtual_file::VirtualFile; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct MaxVectoredReadBytes(pub NonZeroUsize); + +/// Metadata bundled with the start and end offset of a blob. 
+#[derive(Copy, Clone, Debug)] +pub struct BlobMeta { + pub key: Key, + pub lsn: Lsn, +} + +/// Blob offsets into [`VectoredBlobsBuf::buf`] +pub struct VectoredBlob { + pub start: usize, + pub end: usize, + pub meta: BlobMeta, +} + +/// Return type of [`VectoredBlobReader::read_blobs`] +pub struct VectoredBlobsBuf { + /// Buffer for all blobs in this read + pub buf: BytesMut, + /// Offsets into the buffer and metadata for all blobs in this read + pub blobs: Vec, +} + +/// Description of one disk read for multiple blobs. +/// Used as the argument form [`VectoredBlobReader::read_blobs`] +#[derive(Debug)] +pub struct VectoredRead { + pub start: u64, + pub end: u64, + /// Starting offsets and metadata for each blob in this read + pub blobs_at: VecMap, +} + +impl VectoredRead { + pub(crate) fn size(&self) -> usize { + (self.end - self.start) as usize + } +} + +#[derive(Eq, PartialEq)] +pub(crate) enum VectoredReadExtended { + Yes, + No, +} + +pub(crate) struct VectoredReadBuilder { + start: u64, + end: u64, + blobs_at: VecMap, + max_read_size: usize, +} + +impl VectoredReadBuilder { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. The builder will be single use + /// however after that. 
+ pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + ) -> Self { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, meta) + .expect("First insertion always succeeds"); + + Self { + start: start_offset, + end: end_offset, + blobs_at, + max_read_size, + } + } + + /// Attempt to extend the current read with a new blob if the start + /// offset matches with the current end of the vectored read + /// and the resuting size is below the max read size + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); + let size = (end - start) as usize; + if self.end == start && self.size() + size <= self.max_read_size { + self.end = end; + self.blobs_at + .append(start, meta) + .expect("LSNs are ordered within vectored reads"); + + return VectoredReadExtended::Yes; + } + + VectoredReadExtended::No + } + + pub(crate) fn size(&self) -> usize { + (self.end - self.start) as usize + } + + pub(crate) fn build(self) -> VectoredRead { + VectoredRead { + start: self.start, + end: self.end, + blobs_at: self.blobs_at, + } + } +} + +#[derive(Copy, Clone, Debug)] +pub enum BlobFlag { + None, + Ignore, + ReplaceAll, +} + +/// Planner for vectored blob reads. +/// +/// Blob offsets are received via [`VectoredReadPlanner::handle`] +/// and coalesced into disk reads. +/// +/// The implementation is very simple: +/// * Collect all blob offsets in an ordered structure +/// * Iterate over the collected blobs and coalesce them into reads at the end +pub struct VectoredReadPlanner { + // Track all the blob offsets. Start offsets must be ordered. 
+ blobs: BTreeMap>, + // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64, BlobFlag)>, + + max_read_size: usize, +} + +impl VectoredReadPlanner { + pub fn new(max_read_size: usize) -> Self { + Self { + blobs: BTreeMap::new(), + prev: None, + max_read_size, + } + } + + /// Include a new blob in the read plan. + /// + /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads` + /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all + /// keys in a given keyspace. This function must be called for each key in the desired + /// keyspace (monotonically continuous). [`Self::handle_range_end`] must + /// be called after every range in the offset. + /// + /// In the event that keys are skipped, the behaviour is undefined and can lead to an + /// incorrect read plan. We can end up asserting, erroring in wal redo or returning + /// incorrect data to the user. + /// + /// The `flag` argument has two interesting values: + /// * [`BlobFlag::ReplaceAll`]: The blob for this key should replace all existing blobs. + /// This is used for WAL records that `will_init`. + /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens + /// if the blob is cached. 
+ pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev { + None => { + self.prev = Some((key, lsn, offset, flag)); + return; + } + Some(prev) => prev, + }; + + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + + self.prev = Some((key, lsn, offset, flag)); + } + + pub fn handle_range_end(&mut self, offset: u64) { + if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + } + + self.prev = None; + } + + fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) { + match flag { + BlobFlag::None => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::ReplaceAll => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.clear(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::Ignore => {} + } + } + + pub fn finish(self) -> Vec { + let mut current_read_builder: Option = None; + let mut reads = Vec::new(); + + for (key, blobs_for_key) in self.blobs { + for (lsn, start_offset, end_offset) in blobs_for_key { + let extended = match &mut current_read_builder { + Some(read_builder) => { + read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }) + } + None => VectoredReadExtended::No, + }; + + if extended == VectoredReadExtended::No { + let next_read_builder = VectoredReadBuilder::new( + start_offset, + end_offset, + BlobMeta { key, lsn }, + self.max_read_size, + ); + + let prev_read_builder = current_read_builder.replace(next_read_builder); + + // `current_read_builder` is None in the first iteration of the outer loop + if let Some(read_builder) = prev_read_builder { + 
reads.push(read_builder.build()); + } + } + } + } + + if let Some(read_builder) = current_read_builder { + reads.push(read_builder.build()); + } + + reads + } +} + +/// Disk reader for vectored blob spans (does not go through the page cache) +pub struct VectoredBlobReader<'a> { + file: &'a VirtualFile, +} + +impl<'a> VectoredBlobReader<'a> { + pub fn new(file: &'a VirtualFile) -> Self { + Self { file } + } + + /// Read the requested blobs into the buffer. + /// + /// We have to deal with the fact that blobs are not fixed size. + /// Each blob is prefixed by a size header. + /// + /// The success return value is a struct which contains the buffer + /// filled from disk and a list of offsets at which each blob lies + /// in the buffer. + pub async fn read_blobs( + &self, + read: &VectoredRead, + buf: BytesMut, + ctx: &RequestContext, + ) -> Result { + assert!(read.size() > 0); + assert!( + read.size() <= buf.capacity(), + "{} > {}", + read.size(), + buf.capacity() + ); + let buf = self + .file + .read_exact_at_n(buf, read.start, read.size(), ctx) + .await?; + + let blobs_at = read.blobs_at.as_slice(); + let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; + + let mut metas = Vec::with_capacity(blobs_at.len()); + + // Blobs in `read` only provide their starting offset. The end offset + // of a blob is implicit: the start of the next blob if one exists + // or the end of the read. + let pairs = blobs_at.iter().zip( + blobs_at + .iter() + .map(Some) + .skip(1) + .chain(std::iter::once(None)), + ); + + for ((offset, meta), next) in pairs { + let offset_in_buf = offset - start_offset; + let first_len_byte = buf[offset_in_buf as usize]; + + // Each blob is prefixed by a header containing it's size. + // Extract the size and skip that header to find the start of the data. + // The size can be 1 or 4 bytes. The most significant bit is 0 in the + // 1 byte case and 1 in the 4 byte case. 
+ let (size_length, blob_size) = if first_len_byte < 0x80 { + (1, first_len_byte as u64) + } else { + let mut blob_size_buf = [0u8; 4]; + let offset_in_buf = offset_in_buf as usize; + + blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); + blob_size_buf[0] &= 0x7f; + (4, u32::from_be_bytes(blob_size_buf) as u64) + }; + + let start = offset_in_buf + size_length; + let end = match next { + Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, + None => start + blob_size, + }; + + assert_eq!(end - start, blob_size); + + metas.push(VectoredBlob { + start: start as usize, + end: end as usize, + meta: *meta, + }) + } + + Ok(VectoredBlobsBuf { buf, blobs: metas }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { + assert_eq!(read.start, offset_range.first().unwrap().2); + + let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); + + let offsets_in_read: Vec<_> = read + .blobs_at + .as_slice() + .iter() + .map(|(offset, _)| *offset) + .collect(); + + assert_eq!(expected_offsets_in_read, offsets_in_read); + } + + #[test] + fn planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1 + (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2 + (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3 + (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4 + (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5 + (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6 + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..4], + &blob_descriptions[4..5], + &blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = 
VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(652 * 1024); + + let reads = planner.finish(); + assert_eq!(reads.len(), 6); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn planner_replacement_test() { + let max_read_size = 128 * 1024; + let first_key = Key::MIN; + let second_key = first_key.next(); + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (first_key, lsn, 0, BlobFlag::None), // First in read 1 + (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll), + (second_key, lsn, 3 * 1024, BlobFlag::None), + (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + ]; + + let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(6 * 1024); + + let reads = planner.finish(); + assert_eq!(reads.len(), 2); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } +} diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs new file mode 100644 index 0000000000..e6c835aa75 --- /dev/null +++ b/pageserver/src/utilization.rs @@ -0,0 +1,50 @@ +//! An utilization metric which is used to decide on which pageserver to put next tenant. +//! +//! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the +//! truth. 
+ +use anyhow::Context; +use std::path::Path; + +use pageserver_api::models::PageserverUtilization; + +pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result { + // TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough + + let statvfs = nix::sys::statvfs::statvfs(tenants_path) + .map_err(std::io::Error::from) + .context("statvfs tenants directory")?; + + // https://unix.stackexchange.com/a/703650 + let blocksz = if statvfs.fragment_size() > 0 { + statvfs.fragment_size() + } else { + statvfs.block_size() + }; + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let free = statvfs.blocks_available() as u64 * blocksz; + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let used = statvfs + .blocks() + // use blocks_free instead of available here to match df in case someone compares + .saturating_sub(statvfs.blocks_free()) as u64 + * blocksz; + + let captured_at = std::time::SystemTime::now(); + + let doc = PageserverUtilization { + disk_usage_bytes: used, + free_space_bytes: free, + // lower is better; start with a constant + // + // note that u64::MAX will be output as i64::MAX as u64, but that should not matter + utilization_score: u64::MAX, + captured_at: utils::serde_system_time::SystemTime(captured_at), + }; + + // TODO: make utilization_score into a metric + + Ok(doc) +} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 10bed7ca06..04d9386fab 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -10,16 +10,49 @@ //! This is similar to PostgreSQL's virtual file descriptor facility in //! src/backend/storage/file/fd.c //! 
+use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; + +use crate::page_cache::PageWriteGuard; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; -use std::fs::{self, File, OpenOptions}; +use pageserver_api::shard::TenantShardId; +use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; -use std::os::unix::fs::FileExt; +use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; + +use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::sync::{RwLock, RwLockWriteGuard}; -use utils::fs_ext; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use tokio::time::Instant; + +pub use pageserver_api::models::virtual_file as api; +pub(crate) mod io_engine; +pub use io_engine::feature_test as io_engine_feature_test; +pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; +mod metadata; +mod open_options; +use self::owned_buffers_io::write::OwnedAsyncWriter; +pub(crate) use io_engine::IoEngineKind; +pub(crate) use metadata::Metadata; +pub(crate) use open_options::*; + +pub(crate) mod owned_buffers_io { + //! Abstractions for IO with owned buffers. + //! + //! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary + //! reason we need this abstraction. + //! + //! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`, + //! but for the time being we're proving out the primitives in the neon.git repo + //! for faster iteration. + + pub(crate) mod write; + pub(crate) mod util { + pub(crate) mod size_tracking_writer; + } +} /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally @@ -59,6 +92,7 @@ pub struct VirtualFile { // It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into // strings. 
tenant_id: String, + shard_id: String, timeline_id: String, } @@ -103,7 +137,38 @@ struct SlotInner { tag: u64, /// the underlying file - file: Option, + file: Option, +} + +/// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. +struct PageWriteGuardBuf { + page: PageWriteGuard<'static>, + init_up_to: usize, +} +// Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, +// and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. +unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { + fn stable_ptr(&self) -> *const u8 { + self.page.as_ptr() + } + fn bytes_init(&self) -> usize { + self.init_up_to + } + fn bytes_total(&self) -> usize { + self.page.len() + } +} +// Safety: see above, plus: the ownership of [`PageWriteGuard`] means exclusive access, +// hence it's safe to hand out the `stable_mut_ptr()`. +unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { + fn stable_mut_ptr(&mut self) -> *mut u8 { + self.page.as_mut_ptr() + } + + unsafe fn set_init(&mut self, pos: usize) { + assert!(pos <= self.page.len()); + self.init_up_to = pos; + } } impl OpenFiles { @@ -111,7 +176,7 @@ impl OpenFiles { /// /// On return, we hold a lock on the slot, and its 'tag' has been updated /// recently_used has been set. It's all ready for reuse. - fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard) { + async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard) { // // Run the clock algorithm to find a slot to replace. // @@ -143,7 +208,7 @@ impl OpenFiles { } retries += 1; } else { - slot_guard = slot.inner.write().unwrap(); + slot_guard = slot.inner.write().await; index = next; break; } @@ -250,18 +315,52 @@ impl MaybeFatalIo for std::io::Result { } } +/// Observe duration for the given storage I/O operation +/// +/// Unlike `observe_closure_duration`, this supports async, +/// where "support" means that we measure wall clock time. +macro_rules! 
observe_duration { + ($op:expr, $($body:tt)*) => {{ + let instant = Instant::now(); + let result = $($body)*; + let elapsed = instant.elapsed().as_secs_f64(); + STORAGE_IO_TIME_METRIC + .get($op) + .observe(elapsed); + result + }} +} + +macro_rules! with_file { + ($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{ + let $ident = $this.lock_file().await?; + observe_duration!($op, $($body)*) + }}; + ($this:expr, $op:expr, | mut $ident:ident | $($body:tt)*) => {{ + let mut $ident = $this.lock_file().await?; + observe_duration!($op, $($body)*) + }}; +} + impl VirtualFile { /// Open a file in read-only mode. Like File::open. - pub async fn open(path: &Utf8Path) -> Result { - Self::open_with_options(path, OpenOptions::new().read(true)).await + pub async fn open>( + path: P, + ctx: &RequestContext, + ) -> Result { + Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } /// Create a new file for writing. If the file exists, it will be truncated. /// Like File::create. - pub async fn create(path: &Utf8Path) -> Result { + pub async fn create>( + path: P, + ctx: &RequestContext, + ) -> Result { Self::open_with_options( - path, + path.as_ref(), OpenOptions::new().write(true).create(true).truncate(true), + ctx, ) .await } @@ -271,29 +370,40 @@ impl VirtualFile { /// Note: If any custom flags were set in 'open_options' through OpenOptionsExt, /// they will be applied also when the file is subsequently re-opened, not only /// on the first time. Make sure that's sane! 
- pub async fn open_with_options( - path: &Utf8Path, + pub async fn open_with_options>( + path: P, open_options: &OpenOptions, + _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ ) -> Result { - let path_str = path.to_string(); + let path_ref = path.as_ref(); + let path_str = path_ref.to_string(); let parts = path_str.split('/').collect::>(); - let tenant_id; - let timeline_id; - if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME { - tenant_id = parts[parts.len() - 4].to_string(); - timeline_id = parts[parts.len() - 2].to_string(); - } else { - tenant_id = "*".to_string(); - timeline_id = "*".to_string(); - } - let (handle, mut slot_guard) = get_open_files().find_victim_slot(); + let (tenant_id, shard_id, timeline_id) = + if parts.len() > 5 && parts[parts.len() - 5] == TENANTS_SEGMENT_NAME { + let tenant_shard_part = parts[parts.len() - 4]; + let (tenant_id, shard_id) = match tenant_shard_part.parse::() { + Ok(tenant_shard_id) => ( + tenant_shard_id.tenant_id.to_string(), + format!("{}", tenant_shard_id.shard_slug()), + ), + Err(_) => { + // Malformed path: this ID is just for observability, so tolerate it + // and pass through + (tenant_shard_part.to_string(), "*".to_string()) + } + }; + (tenant_id, shard_id, parts[parts.len() - 2].to_string()) + } else { + ("*".to_string(), "*".to_string(), "*".to_string()) + }; + let (handle, mut slot_guard) = get_open_files().find_victim_slot().await; // NB: there is also StorageIoOperation::OpenAfterReplace which is for the case // where our caller doesn't get to use the returned VirtualFile before its // slot gets re-used by someone else. - let file = STORAGE_IO_TIME_METRIC - .get(StorageIoOperation::Open) - .observe_closure_duration(|| open_options.open(path))?; + let file = observe_duration!(StorageIoOperation::Open, { + open_options.open(path_ref.as_std_path()).await? 
+ }); // Strip all options other than read and write. // @@ -308,9 +418,10 @@ impl VirtualFile { let vfile = VirtualFile { handle: RwLock::new(handle), pos: 0, - path: path.to_path_buf(), + path: path_ref.to_path_buf(), open_options: reopen_options, tenant_id, + shard_id, timeline_id, }; @@ -322,66 +433,66 @@ impl VirtualFile { Ok(vfile) } - /// Writes a file to the specified `final_path` in a crash safe fasion + /// Async version of [`::utils::crashsafe::overwrite`]. /// - /// The file is first written to the specified tmp_path, and in a second - /// step, the tmp path is renamed to the final path. As renames are - /// atomic, a crash during the write operation will never leave behind a - /// partially written file. - pub async fn crashsafe_overwrite( - final_path: &Utf8Path, - tmp_path: &Utf8Path, - content: &[u8], + /// # NB: + /// + /// Doesn't actually use the [`VirtualFile`] file descriptor cache, but, + /// it did at an earlier time. + /// And it will use this module's [`io_engine`] in the near future, so, leaving it here. + pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>( + final_path: Utf8PathBuf, + tmp_path: Utf8PathBuf, + content: B, ) -> std::io::Result<()> { - let Some(final_path_parent) = final_path.parent() else { - return Err(std::io::Error::from_raw_os_error( - nix::errno::Errno::EINVAL as i32, - )); - }; - std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?; - let mut file = Self::open_with_options( - tmp_path, - OpenOptions::new() - .write(true) - // Use `create_new` so that, if we race with ourselves or something else, - // we bail out instead of causing damage. - .create_new(true), - ) - .await?; - file.write_all(content).await?; - file.sync_all().await?; - drop(file); // before the rename, that's important! - // renames are atomic - std::fs::rename(tmp_path, final_path)?; - // Only open final path parent dirfd now, so that this operation only - // ever holds one VirtualFile fd at a time. 
That's important because - // the current `find_victim_slot` impl might pick the same slot for both - // VirtualFile., and it eventually does a blocking write lock instead of - // try_lock. - let final_parent_dirfd = - Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?; - final_parent_dirfd.sync_all().await?; - Ok(()) + // TODO: use tokio_epoll_uring if configured as `io_engine`. + // See https://github.com/neondatabase/neon/issues/6663 + + tokio::task::spawn_blocking(move || { + let slice_storage; + let content_len = content.bytes_init(); + let content = if content.bytes_init() > 0 { + slice_storage = Some(content.slice(0..content_len)); + slice_storage.as_deref().expect("just set it to Some()") + } else { + &[] + }; + utils::crashsafe::overwrite(&final_path, &tmp_path, content) + }) + .await + .expect("blocking task is never aborted") } /// Call File::sync_all() on the underlying File. pub async fn sync_all(&self) -> Result<(), Error> { - self.with_file(StorageIoOperation::Fsync, |file| file.sync_all()) - .await? + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_all(file_guard).await; + res + }) } - pub async fn metadata(&self) -> Result { - self.with_file(StorageIoOperation::Metadata, |file| file.metadata()) - .await? + /// Call File::sync_data() on the underlying File. + pub async fn sync_data(&self) -> Result<(), Error> { + with_file!(self, StorageIoOperation::Fsync, |file_guard| { + let (_file_guard, res) = io_engine::get().sync_data(file_guard).await; + res + }) } - /// Helper function that looks up the underlying File for this VirtualFile, - /// opening it and evicting some other File if necessary. It calls 'func' - /// with the physical File. 
- async fn with_file(&self, op: StorageIoOperation, mut func: F) -> Result - where - F: FnMut(&File) -> R, - { + pub async fn metadata(&self) -> Result { + with_file!(self, StorageIoOperation::Metadata, |file_guard| { + let (_file_guard, res) = io_engine::get().metadata(file_guard).await; + res + }) + } + + /// Helper function internal to `VirtualFile` that looks up the underlying File, + /// opens it and evicts some other File if necessary. The passed parameter is + /// assumed to be a function available for the physical `File`. + /// + /// We are doing it via a macro as Rust doesn't support async closures that + /// take on parameters with lifetimes. + async fn lock_file(&self) -> Result { let open_files = get_open_files(); let mut handle_guard = { @@ -391,27 +502,23 @@ impl VirtualFile { // We only need to hold the handle lock while we read the current handle. If // another thread closes the file and recycles the slot for a different file, // we will notice that the handle we read is no longer valid and retry. - let mut handle = *self.handle.read().unwrap(); + let mut handle = *self.handle.read().await; loop { // Check if the slot contains our File { let slot = &open_files.slots[handle.index]; - let slot_guard = slot.inner.read().unwrap(); - if slot_guard.tag == handle.tag { - if let Some(file) = &slot_guard.file { - // Found a cached file descriptor. - slot.recently_used.store(true, Ordering::Relaxed); - return Ok(STORAGE_IO_TIME_METRIC - .get(op) - .observe_closure_duration(|| func(file))); - } + let slot_guard = slot.inner.read().await; + if slot_guard.tag == handle.tag && slot_guard.file.is_some() { + // Found a cached file descriptor. + slot.recently_used.store(true, Ordering::Relaxed); + return Ok(FileGuard { slot_guard }); } } // The slot didn't contain our File. We will have to open it ourselves, // but before that, grab a write lock on handle in the VirtualFile, so // that no other thread will try to concurrently open the same file. 
- let handle_guard = self.handle.write().unwrap(); + let handle_guard = self.handle.write().await; // If another thread changed the handle while we were not holding the lock, // then the handle might now be valid again. Loop back to retry. @@ -425,20 +532,15 @@ impl VirtualFile { // We need to open the file ourselves. The handle in the VirtualFile is // now locked in write-mode. Find a free slot to put it in. - let (handle, mut slot_guard) = open_files.find_victim_slot(); + let (handle, mut slot_guard) = open_files.find_victim_slot().await; // Re-open the physical file. // NB: we use StorageIoOperation::OpenAferReplace for this to distinguish this // case from StorageIoOperation::Open. This helps with identifying thrashing // of the virtual file descriptor cache. - let file = STORAGE_IO_TIME_METRIC - .get(StorageIoOperation::OpenAfterReplace) - .observe_closure_duration(|| self.open_options.open(&self.path))?; - - // Perform the requested operation on it - let result = STORAGE_IO_TIME_METRIC - .get(op) - .observe_closure_duration(|| func(&file)); + let file = observe_duration!(StorageIoOperation::OpenAfterReplace, { + self.open_options.open(self.path.as_std_path()).await? + }); // Store the File in the slot and update the handle in the VirtualFile // to point to it. @@ -446,7 +548,9 @@ impl VirtualFile { *handle_guard = handle; - Ok(result) + return Ok(FileGuard { + slot_guard: slot_guard.downgrade(), + }); } pub fn remove(self) { @@ -461,11 +565,8 @@ impl VirtualFile { self.pos = offset; } SeekFrom::End(offset) => { - self.pos = self - .with_file(StorageIoOperation::Seek, |mut file| { - file.seek(SeekFrom::End(offset)) - }) - .await?? + self.pos = with_file!(self, StorageIoOperation::Seek, |mut file_guard| file_guard + .with_std_file_mut(|std_file| std_file.seek(SeekFrom::End(offset))))? 
} SeekFrom::Current(offset) => { let pos = self.pos as i128 + offset as i128; @@ -484,96 +585,461 @@ impl VirtualFile { Ok(self.pos) } - // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 - pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> { - while !buf.is_empty() { - match self.read_at(buf, offset).await { - Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::UnexpectedEof, - "failed to fill whole buffer", - )) - } - Ok(n) => { - buf = &mut buf[n..]; - offset += n as u64; - } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), - } - } - Ok(()) + pub async fn read_exact_at( + &self, + buf: B, + offset: u64, + ctx: &RequestContext, + ) -> Result + where + B: IoBufMut + Send, + { + let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { + self.read_at(buf, offset, ctx) + }) + .await; + res.map(|()| buf) + } + + pub async fn read_exact_at_n( + &self, + buf: B, + offset: u64, + count: usize, + ctx: &RequestContext, + ) -> Result + where + B: IoBufMut + Send, + { + let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { + self.read_at(buf, offset, ctx) + }) + .await; + res.map(|()| buf) + } + + /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. + pub async fn read_exact_at_page( + &self, + page: PageWriteGuard<'static>, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> { + let buf = PageWriteGuardBuf { + page, + init_up_to: 0, + }; + let res = self.read_exact_at(buf, offset, ctx).await; + res.map(|PageWriteGuardBuf { page, .. 
}| page) + .map_err(|e| Error::new(ErrorKind::Other, e)) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 - pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> { + pub async fn write_all_at, Buf: IoBuf + Send>( + &self, + buf: B, + mut offset: u64, + ctx: &RequestContext, + ) -> (B::Buf, Result<(), Error>) { + let buf_len = buf.bytes_init(); + if buf_len == 0 { + return (Slice::into_inner(buf.slice_full()), Ok(())); + } + let mut buf = buf.slice(0..buf_len); while !buf.is_empty() { - match self.write_at(buf, offset).await { + let res; + (buf, res) = self.write_at(buf, offset, ctx).await; + match res { Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); + return ( + Slice::into_inner(buf), + Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )), + ); } Ok(n) => { - buf = &buf[n..]; + buf = buf.slice(n..); offset += n as u64; } - Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), + Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return (Slice::into_inner(buf), Err(e)), } } - Ok(()) + (Slice::into_inner(buf), Ok(())) } - pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> { + /// Writes `buf.slice(0..buf.bytes_init())`. + /// Returns the IoBuf that is underlying the BoundedBuf `buf`. + /// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in. + /// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant. 
+ pub async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> (B::Buf, Result) { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + return (Slice::into_inner(buf.slice_full()), Ok(0)); + } + let mut buf = buf.slice(0..nbytes); while !buf.is_empty() { - match self.write(buf).await { + let res; + (buf, res) = self.write(buf, ctx).await; + match res { Ok(0) => { - return Err(Error::new( - std::io::ErrorKind::WriteZero, - "failed to write whole buffer", - )); + return ( + Slice::into_inner(buf), + Err(Error::new( + std::io::ErrorKind::WriteZero, + "failed to write whole buffer", + )), + ); } Ok(n) => { - buf = &buf[n..]; + buf = buf.slice(n..); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} - Err(e) => return Err(e), + Err(e) => return (Slice::into_inner(buf), Err(e)), } } - Ok(()) + (Slice::into_inner(buf), Ok(nbytes)) } - async fn write(&mut self, buf: &[u8]) -> Result { + async fn write( + &mut self, + buf: Slice, + ctx: &RequestContext, + ) -> (Slice, Result) { let pos = self.pos; - let n = self.write_at(buf, pos).await?; + let (buf, res) = self.write_at(buf, pos, ctx).await; + let n = match res { + Ok(n) => n, + Err(e) => return (buf, Err(e)), + }; self.pos += n as u64; - Ok(n) + (buf, Ok(n)) } - pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result { - let result = self - .with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset)) - .await?; - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["read", &self.tenant_id, &self.timeline_id]) - .add(size as i64); - } - result + pub(crate) async fn read_at( + &self, + buf: B, + offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ) -> (B, Result) + where + B: tokio_epoll_uring::BoundedBufMut + Send, + { + let file_guard = match self.lock_file().await { + Ok(file_guard) => file_guard, + Err(e) => return (buf, Err(e)), + }; + + 
observe_duration!(StorageIoOperation::Read, { + let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await; + if let Ok(size) = res { + STORAGE_IO_SIZE + .with_label_values(&[ + "read", + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + ]) + .add(size as i64); + } + (buf, res) + }) } - async fn write_at(&self, buf: &[u8], offset: u64) -> Result { - let result = self - .with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset)) - .await?; - if let Ok(size) = result { - STORAGE_IO_SIZE - .with_label_values(&["write", &self.tenant_id, &self.timeline_id]) - .add(size as i64); + async fn write_at( + &self, + buf: Slice, + offset: u64, + _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ + ) -> (Slice, Result) { + let file_guard = match self.lock_file().await { + Ok(file_guard) => file_guard, + Err(e) => return (buf, Err(e)), + }; + observe_duration!(StorageIoOperation::Write, { + let ((_file_guard, buf), result) = + io_engine::get().write_at(file_guard, offset, buf).await; + if let Ok(size) = result { + STORAGE_IO_SIZE + .with_label_values(&[ + "write", + &self.tenant_id, + &self.shard_id, + &self.timeline_id, + ]) + .add(size as i64); + } + (buf, result) + }) + } +} + +// Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 +pub async fn read_exact_at_impl( + buf: B, + mut offset: u64, + count: Option, + mut read_at: F, +) -> (B, std::io::Result<()>) +where + B: IoBufMut + Send, + F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, + Fut: std::future::Future, std::io::Result)>, +{ + let mut buf: tokio_epoll_uring::Slice = match count { + Some(count) => { + assert!(count <= buf.bytes_total()); + assert!(count > 0); + buf.slice(..count) // may include uninitialized memory } - result + None => buf.slice_full(), // includes all the uninitialized memory + }; + + while buf.bytes_total() != 0 { + let res; + (buf, res) = read_at(buf, offset).await; + 
match res { + Ok(0) => break, + Ok(n) => { + buf = buf.slice(n..); + offset += n as u64; + } + Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} + Err(e) => return (buf.into_inner(), Err(e)), + } + } + // NB: don't use `buf.is_empty()` here; it is from the + // `impl Deref for Slice { Target = [u8] }`; the &[u8] + // returned by it only covers the initialized portion of `buf`. + // Whereas we're interested in ensuring that we filled the entire + // buffer that the user passed in. + if buf.bytes_total() != 0 { + ( + buf.into_inner(), + Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "failed to fill whole buffer", + )), + ) + } else { + assert_eq!(buf.len(), buf.bytes_total()); + (buf.into_inner(), Ok(())) + } +} + +#[cfg(test)] +mod test_read_exact_at_impl { + + use std::{collections::VecDeque, sync::Arc}; + + use tokio_epoll_uring::{BoundedBuf, BoundedBufMut}; + + use super::read_exact_at_impl; + + struct Expectation { + offset: u64, + bytes_total: usize, + result: std::io::Result>, + } + struct MockReadAt { + expectations: VecDeque, + } + + impl MockReadAt { + async fn read_at( + &mut self, + mut buf: tokio_epoll_uring::Slice>, + offset: u64, + ) -> (tokio_epoll_uring::Slice>, std::io::Result) { + let exp = self + .expectations + .pop_front() + .expect("read_at called but we have no expectations left"); + assert_eq!(exp.offset, offset); + assert_eq!(exp.bytes_total, buf.bytes_total()); + match exp.result { + Ok(bytes) => { + assert!(bytes.len() <= buf.bytes_total()); + buf.put_slice(&bytes); + (buf, Ok(bytes.len())) + } + Err(e) => (buf, Err(e)), + } + } + } + + impl Drop for MockReadAt { + fn drop(&mut self) { + assert_eq!(self.expectations.len(), 0); + } + } + + #[tokio::test] + async fn test_basic() { + let buf = Vec::with_capacity(5); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![Expectation { + offset: 0, + bytes_total: 5, + result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), + 
}]), + })); + let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); + } + + #[tokio::test] + async fn test_with_count() { + let buf = Vec::with_capacity(5); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![Expectation { + offset: 0, + bytes_total: 3, + result: Ok(vec![b'a', b'b', b'c']), + }]), + })); + + let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c']); + } + + #[tokio::test] + async fn test_empty_buf_issues_no_syscall() { + let buf = Vec::new(); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::new(), + })); + let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + } + + #[tokio::test] + async fn test_two_read_at_calls_needed_until_buf_filled() { + let buf = Vec::with_capacity(4); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![ + Expectation { + offset: 0, + bytes_total: 4, + result: Ok(vec![b'a', b'b']), + }, + Expectation { + offset: 2, + bytes_total: 2, + result: Ok(vec![b'c', b'd']), + }, + ]), + })); + let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c', b'd']); + } + + #[tokio::test] + async fn 
test_eof_before_buffer_full() { + let buf = Vec::with_capacity(3); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![ + Expectation { + offset: 0, + bytes_total: 3, + result: Ok(vec![b'a']), + }, + Expectation { + offset: 1, + bytes_total: 2, + result: Ok(vec![b'b']), + }, + Expectation { + offset: 2, + bytes_total: 1, + result: Ok(vec![]), + }, + ]), + })); + let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + let Err(err) = res else { + panic!("should return an error"); + }; + assert_eq!(err.kind(), std::io::ErrorKind::UnexpectedEof); + assert_eq!(format!("{err}"), "failed to fill whole buffer"); + // buffer contents on error are unspecified + } +} + +struct FileGuard { + slot_guard: RwLockReadGuard<'static, SlotInner>, +} + +impl AsRef for FileGuard { + fn as_ref(&self) -> &OwnedFd { + // This unwrap is safe because we only create `FileGuard`s + // if we know that the file is Some. + self.slot_guard.file.as_ref().unwrap() + } +} + +impl FileGuard { + /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually. + fn with_std_file(&self, with: F) -> R + where + F: FnOnce(&File) -> R, + { + // SAFETY: + // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`. + // - `&` usage below: `self` is `&`, hence Rust typesystem guarantees there are is no `&mut` + let file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) }; + let res = with(&file); + let _ = file.into_raw_fd(); + res + } + /// Soft deprecation: we'll move VirtualFile to async APIs and remove this function eventually. + fn with_std_file_mut(&mut self, with: F) -> R + where + F: FnOnce(&mut File) -> R, + { + // SAFETY: + // - lifetime of the fd: `file` doesn't outlive the OwnedFd stored in `self`. 
+ // - &mut usage below: `self` is `&mut`, hence this call is the only task/thread that has control over the underlying fd + let mut file = unsafe { File::from_raw_fd(self.as_ref().as_raw_fd()) }; + let res = with(&mut file); + let _ = file.into_raw_fd(); + res + } +} + +impl tokio_epoll_uring::IoFd for FileGuard { + unsafe fn as_fd(&self) -> RawFd { + let owned_fd: &OwnedFd = self.as_ref(); + owned_fd.as_raw_fd() } } @@ -582,18 +1048,22 @@ impl VirtualFile { pub(crate) async fn read_blk( &self, blknum: u32, + ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let mut buf = [0; PAGE_SZ]; - self.read_exact_at(&mut buf, blknum as u64 * (PAGE_SZ as u64)) + let buf = vec![0; PAGE_SZ]; + let buf = self + .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; - Ok(std::sync::Arc::new(buf).into()) + Ok(crate::tenant::block_io::BlockLease::Vec(buf)) } - async fn read_to_end(&mut self, buf: &mut Vec) -> Result<(), Error> { + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + let mut tmp = vec![0; 128]; loop { - let mut tmp = [0; 128]; - match self.read_at(&mut tmp, self.pos).await { + let res; + (tmp, res) = self.read_at(tmp, self.pos, ctx).await; + match res { Ok(0) => return Ok(()), Ok(n) => { self.pos += n as u64; @@ -609,22 +1079,53 @@ impl VirtualFile { impl Drop for VirtualFile { /// If a VirtualFile is dropped, close the underlying file if it was open. fn drop(&mut self) { - let handle = self.handle.get_mut().unwrap(); + let handle = self.handle.get_mut(); - // We could check with a read-lock first, to avoid waiting on an - // unrelated I/O. - let slot = &get_open_files().slots[handle.index]; - let mut slot_guard = slot.inner.write().unwrap(); - if slot_guard.tag == handle.tag { - slot.recently_used.store(false, Ordering::Relaxed); - // there is also operation "close-by-replace" for closes done on eviction for - // comparison. 
- if let Some(fd) = slot_guard.file.take() { - STORAGE_IO_TIME_METRIC - .get(StorageIoOperation::Close) - .observe_closure_duration(|| drop(fd)); + fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) { + if slot_guard.tag == tag { + slot.recently_used.store(false, Ordering::Relaxed); + // there is also operation "close-by-replace" for closes done on eviction for + // comparison. + if let Some(fd) = slot_guard.file.take() { + STORAGE_IO_TIME_METRIC + .get(StorageIoOperation::Close) + .observe_closure_duration(|| drop(fd)); + } } } + + // We don't have async drop so we cannot directly await the lock here. + // Instead, first do a best-effort attempt at closing the underlying + // file descriptor by using `try_write`, and if that fails, spawn + // a tokio task to do it asynchronously: we just want it to be + // cleaned up eventually. + // Most of the time, the `try_lock` should succeed though, + // as we have `&mut self` access. In other words, if the slot + // is still occupied by our file, there should be no access from + // other I/O operations; the only other possible place to lock + // the slot is the lock algorithm looking for free slots. + let slot = &get_open_files().slots[handle.index]; + if let Ok(slot_guard) = slot.inner.try_write() { + clean_slot(slot, slot_guard, handle.tag); + } else { + let tag = handle.tag; + tokio::spawn(async move { + let slot_guard = slot.inner.write().await; + clean_slot(slot, slot_guard, tag); + }); + }; + } +} + +impl OwnedAsyncWriter for VirtualFile { + #[inline(always)] + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; + res.map(move |v| (v, buf)) } } @@ -650,10 +1151,12 @@ impl OpenFiles { /// Initialize the virtual file module. This must be called once at page /// server startup. 
/// -pub fn init(num_slots: usize) { +#[cfg(not(test))] +pub fn init(num_slots: usize, engine: IoEngineKind) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -678,12 +1181,15 @@ fn get_open_files() -> &'static OpenFiles { #[cfg(test)] mod tests { + use crate::context::DownloadBehavior; + use crate::task_mgr::TaskKind; + use super::*; use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; - use std::future::Future; use std::io::Write; + use std::os::unix::fs::FileExt; use std::sync::Arc; enum MaybeVirtualFile { @@ -698,16 +1204,35 @@ mod tests { } impl MaybeVirtualFile { - async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> { + async fn read_exact_at( + &self, + mut buf: Vec, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await, - MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset), + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await, + MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), } } - async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> { + async fn write_all_at, Buf: IoBuf + Send>( + &self, + buf: B, + offset: u64, + ctx: &RequestContext, + ) -> Result<(), Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await, - MaybeVirtualFile::File(file) => file.write_all_at(buf, offset), + MaybeVirtualFile::VirtualFile(file) => { + let (_buf, res) = file.write_all_at(buf, offset, ctx).await; + res + } + MaybeVirtualFile::File(file) => { + let buf_len = buf.bytes_init(); + if buf_len == 0 { + return Ok(()); + } + file.write_all_at(&buf.slice(0..buf_len), offset) + } } } async fn seek(&mut self, pos: SeekFrom) -> Result { @@ 
-716,22 +1241,35 @@ mod tests { MaybeVirtualFile::File(file) => file.seek(pos), } } - async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> Result<(), Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await, - MaybeVirtualFile::File(file) => file.write_all(buf), + MaybeVirtualFile::VirtualFile(file) => { + let (_buf, res) = file.write_all(buf, ctx).await; + res.map(|_| ()) + } + MaybeVirtualFile::File(file) => { + let buf_len = buf.bytes_init(); + if buf_len == 0 { + return Ok(()); + } + file.write_all(&buf.slice(0..buf_len)) + } } } // Helper function to slurp contents of a file, starting at the current position, // into a string - async fn read_string(&mut self) -> Result { + async fn read_string(&mut self, ctx: &RequestContext) -> Result { use std::io::Read; let mut buf = String::new(); match self { MaybeVirtualFile::VirtualFile(file) => { let mut buf = Vec::new(); - file.read_to_end(&mut buf).await?; + file.read_to_end(&mut buf, ctx).await?; return Ok(String::from_utf8(buf).unwrap()); } MaybeVirtualFile::File(file) => { @@ -742,15 +1280,20 @@ mod tests { } // Helper function to slurp a portion of a file into a string - async fn read_string_at(&mut self, pos: u64, len: usize) -> Result { - let mut buf = vec![0; len]; - self.read_exact_at(&mut buf, pos).await?; + async fn read_string_at( + &mut self, + pos: u64, + len: usize, + ctx: &RequestContext, + ) -> Result { + let buf = vec![0; len]; + let buf = self.read_exact_at(buf, pos, ctx).await?; Ok(String::from_utf8(buf).unwrap()) } } #[tokio::test] - async fn test_virtual_files() -> Result<(), Error> { + async fn test_virtual_files() -> anyhow::Result<()> { // The real work is done in the test_files() helper function. This // allows us to run the same set of tests against a native File, and // VirtualFile. 
We trust the native Files and wouldn't need to test them, @@ -758,69 +1301,101 @@ mod tests { // results with VirtualFiles as with native Files. (Except that with // native files, you will run out of file descriptors if the ulimit // is low enough.) - test_files("virtual_files", |path, open_options| async move { - let vf = VirtualFile::open_with_options(&path, &open_options).await?; - Ok(MaybeVirtualFile::VirtualFile(vf)) - }) - .await + struct A; + + impl Adapter for A { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + ctx: &RequestContext, + ) -> Result { + let vf = VirtualFile::open_with_options(&path, &opts, ctx).await?; + Ok(MaybeVirtualFile::VirtualFile(vf)) + } + } + test_files::("virtual_files").await } #[tokio::test] - async fn test_physical_files() -> Result<(), Error> { - test_files("physical_files", |path, open_options| async move { - Ok(MaybeVirtualFile::File(open_options.open(path)?)) - }) - .await + async fn test_physical_files() -> anyhow::Result<()> { + struct B; + + impl Adapter for B { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + _ctx: &RequestContext, + ) -> Result { + Ok(MaybeVirtualFile::File({ + let owned_fd = opts.open(path.as_std_path()).await?; + File::from(owned_fd) + })) + } + } + + test_files::("physical_files").await } - async fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> + /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition + /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function + /// in trait which benefits from the new lifetime capture rules already. 
+ trait Adapter { + async fn open( + path: Utf8PathBuf, + opts: OpenOptions, + ctx: &RequestContext, + ) -> Result; + } + + async fn test_files(testname: &str) -> anyhow::Result<()> where - OF: Fn(Utf8PathBuf, OpenOptions) -> FT, - FT: Future>, + A: Adapter, { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); - let mut file_a = openfunc( + let mut file_a = A::open( path_a.clone(), OpenOptions::new() .write(true) .create(true) .truncate(true) .to_owned(), + &ctx, ) .await?; - file_a.write_all(b"foobar").await?; + file_a.write_all(b"foobar".to_vec(), &ctx).await?; // cannot read from a file opened in write-only mode - let _ = file_a.read_string().await.unwrap_err(); + let _ = file_a.read_string(&ctx).await.unwrap_err(); // Close the file and re-open for reading - let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?; + let mut file_a = A::open(path_a, OpenOptions::new().read(true).to_owned(), &ctx).await?; // cannot write to a file opened in read-only mode - let _ = file_a.write_all(b"bar").await.unwrap_err(); + let _ = file_a.write_all(b"bar".to_vec(), &ctx).await.unwrap_err(); // Try simple read - assert_eq!("foobar", file_a.read_string().await?); + assert_eq!("foobar", file_a.read_string(&ctx).await?); // It's positioned at the EOF now. - assert_eq!("", file_a.read_string().await?); + assert_eq!("", file_a.read_string(&ctx).await?); // Test seeks. 
assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::End(-2)).await?, 4); - assert_eq!("ar", file_a.read_string().await?); + assert_eq!("ar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Start(1)).await?, 1); assert_eq!(file_a.seek(SeekFrom::Current(2)).await?, 3); - assert_eq!("bar", file_a.read_string().await?); + assert_eq!("bar", file_a.read_string(&ctx).await?); assert_eq!(file_a.seek(SeekFrom::Current(-5)).await?, 1); - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Test erroneous seeks to before byte 0 file_a.seek(SeekFrom::End(-7)).await.unwrap_err(); @@ -828,11 +1403,11 @@ mod tests { file_a.seek(SeekFrom::Current(-2)).await.unwrap_err(); // the erroneous seek should have left the position unchanged - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Create another test file, and try FileExt functions on it. let path_b = testdir.join("file_b"); - let mut file_b = openfunc( + let mut file_b = A::open( path_b.clone(), OpenOptions::new() .read(true) @@ -840,12 +1415,13 @@ mod tests { .create(true) .truncate(true) .to_owned(), + &ctx, ) .await?; - file_b.write_all_at(b"BAR", 3).await?; - file_b.write_all_at(b"FOO", 0).await?; + file_b.write_all_at(b"BAR".to_vec(), 3, &ctx).await?; + file_b.write_all_at(b"FOO".to_vec(), 0, &ctx).await?; - assert_eq!(file_b.read_string_at(2, 3).await?, "OBA"); + assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); // Open a lot of files, enough to cause some evictions. (Or to be precise, // open the same file many times. The effect is the same.) 
@@ -855,9 +1431,13 @@ mod tests { let mut vfiles = Vec::new(); for _ in 0..100 { - let mut vfile = - openfunc(path_b.clone(), OpenOptions::new().read(true).to_owned()).await?; - assert_eq!("FOOBAR", vfile.read_string().await?); + let mut vfile = A::open( + path_b.clone(), + OpenOptions::new().read(true).to_owned(), + &ctx, + ) + .await?; + assert_eq!("FOOBAR", vfile.read_string(&ctx).await?); vfiles.push(vfile); } @@ -866,13 +1446,13 @@ mod tests { // The underlying file descriptor for 'file_a' should be closed now. Try to read // from it again. We left the file positioned at offset 1 above. - assert_eq!("oobar", file_a.read_string().await?); + assert_eq!("oobar", file_a.read_string(&ctx).await?); // Check that all the other FDs still work too. Use them in random order for // good measure. vfiles.as_mut_slice().shuffle(&mut thread_rng()); for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", vfile.read_string_at(1, 5).await?); + assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?); } Ok(()) @@ -888,6 +1468,7 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; @@ -901,8 +1482,12 @@ mod tests { // Open the file many times. 
let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true)) - .await?; + let f = VirtualFile::open_with_options( + &test_file_path, + OpenOptions::new().read(true), + &ctx, + ) + .await?; files.push(f); } let files = Arc::new(files); @@ -916,12 +1501,13 @@ mod tests { let mut hdls = Vec::new(); for _threadno in 0..THREADS { let files = files.clone(); + let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error); let hdl = rt.spawn(async move { - let mut buf = [0u8; SIZE]; + let mut buf = vec![0u8; SIZE]; let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - f.read_exact_at(&mut buf, 0).await.unwrap(); + buf = f.read_exact_at(buf, 0, &ctx).await.unwrap(); assert!(buf == SAMPLE); } }); @@ -937,26 +1523,27 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar") + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = 
file.read_string().await.unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); drop(file); @@ -964,6 +1551,7 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); @@ -974,12 +1562,12 @@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo") + VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap()); - let post = file.read_string().await.unwrap(); + let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); + let post = file.read_string(&ctx).await.unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); drop(file); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs new file mode 100644 index 0000000000..7a27be2ca1 --- /dev/null +++ b/pageserver/src/virtual_file/io_engine.rs @@ -0,0 +1,339 @@ +//! [`super::VirtualFile`] supports different IO engines. +//! +//! The [`IoEngineKind`] enum identifies them. +//! +//! The choice of IO engine is global. +//! Initialize using [`init`]. +//! +//! Then use [`get`] and [`super::OpenOptions`]. +//! +//! 
+ +#[cfg(target_os = "linux")] +pub(super) mod tokio_epoll_uring_ext; + +use tokio_epoll_uring::{IoBuf, Slice}; +use tracing::Instrument; + +pub(crate) use super::api::IoEngineKind; +#[derive(Clone, Copy)] +#[repr(u8)] +pub(crate) enum IoEngine { + NotSet, + StdFs, + #[cfg(target_os = "linux")] + TokioEpollUring, +} + +impl From for IoEngine { + fn from(value: IoEngineKind) -> Self { + match value { + IoEngineKind::StdFs => IoEngine::StdFs, + #[cfg(target_os = "linux")] + IoEngineKind::TokioEpollUring => IoEngine::TokioEpollUring, + } + } +} + +impl TryFrom for IoEngine { + type Error = u8; + + fn try_from(value: u8) -> Result { + Ok(match value { + v if v == (IoEngine::NotSet as u8) => IoEngine::NotSet, + v if v == (IoEngine::StdFs as u8) => IoEngine::StdFs, + #[cfg(target_os = "linux")] + v if v == (IoEngine::TokioEpollUring as u8) => IoEngine::TokioEpollUring, + x => return Err(x), + }) + } +} + +static IO_ENGINE: AtomicU8 = AtomicU8::new(IoEngine::NotSet as u8); + +pub(crate) fn set(engine_kind: IoEngineKind) { + let engine: IoEngine = engine_kind.into(); + IO_ENGINE.store(engine as u8, std::sync::atomic::Ordering::Relaxed); + #[cfg(not(test))] + { + let metric = &crate::metrics::virtual_file_io_engine::KIND; + metric.reset(); + metric + .with_label_values(&[&format!("{engine_kind}")]) + .set(1); + } +} + +#[cfg(not(test))] +pub(super) fn init(engine_kind: IoEngineKind) { + set(engine_kind); +} + +/// Longer-term, this API should only be used by [`super::VirtualFile`]. 
+pub(crate) fn get() -> IoEngine { + let cur = IoEngine::try_from(IO_ENGINE.load(Ordering::Relaxed)).unwrap(); + if cfg!(test) { + let env_var_name = "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE"; + match cur { + IoEngine::NotSet => { + let kind = match std::env::var(env_var_name) { + Ok(v) => match v.parse::() { + Ok(engine_kind) => engine_kind, + Err(e) => { + panic!("invalid VirtualFile io engine for env var {env_var_name}: {e:#}: {v:?}") + } + }, + Err(std::env::VarError::NotPresent) => { + crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE + .parse() + .unwrap() + } + Err(std::env::VarError::NotUnicode(_)) => { + panic!("env var {env_var_name} is not unicode"); + } + }; + self::set(kind); + self::get() + } + x => x, + } + } else { + cur + } +} + +use std::{ + os::unix::prelude::FileExt, + sync::atomic::{AtomicU8, Ordering}, +}; + +use super::{FileGuard, Metadata}; + +#[cfg(target_os = "linux")] +fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { + match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + } +} + +impl IoEngine { + pub(super) async fn read_at( + &self, + file_guard: FileGuard, + offset: u64, + mut buf: B, + ) -> ((FileGuard, B), std::io::Result) + where + B: tokio_epoll_uring::BoundedBufMut + Send, + { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory. 
+ let dst = unsafe { + std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) + }; + let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset)); + if let Ok(nbytes) = &res { + assert!(*nbytes <= buf.bytes_total()); + // SAFETY: see above assertion + unsafe { + buf.set_init(*nbytes); + } + } + #[allow(dropping_references)] + drop(dst); + ((file_guard, buf), res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let (resources, res) = system.read(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_all(&self, file_guard: FileGuard) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_all()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let (resources, res) = system.fsync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn sync_data( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result<()>) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = file_guard.with_std_file(|std_file| std_file.sync_data()); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let (resources, res) = system.fdatasync(file_guard).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + pub(super) async fn metadata( + &self, + file_guard: FileGuard, + ) -> (FileGuard, std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let res = + file_guard.with_std_file(|std_file| 
std_file.metadata().map(Metadata::from)); + (file_guard, res) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let (resources, res) = system.statx(file_guard).await; + ( + resources, + res.map_err(epoll_uring_error_to_std).map(Metadata::from), + ) + } + } + } + pub(super) async fn write_at( + &self, + file_guard: FileGuard, + offset: u64, + buf: Slice, + ) -> ((FileGuard, Slice), std::io::Result) { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let result = file_guard.with_std_file(|std_file| std_file.write_at(&buf, offset)); + ((file_guard, buf), result) + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + let system = tokio_epoll_uring_ext::thread_local_system().await; + let (resources, res) = system.write(file_guard, offset, buf).await; + (resources, res.map_err(epoll_uring_error_to_std)) + } + } + } + + /// If we switch a user of [`tokio::fs`] to use [`super::io_engine`], + /// they'd start blocking the executor thread if [`IoEngine::StdFs`] is configured + /// whereas before the switch to [`super::io_engine`], that wasn't the case. + /// This method helps avoid such a regression. + /// + /// Panics if the `spawn_blocking` fails, see [`tokio::task::JoinError`] for reasons why that can happen. 
+ pub(crate) async fn spawn_blocking_and_block_on_if_std(&self, work: Fut) -> R + where + Fut: 'static + Send + std::future::Future, + R: 'static + Send, + { + match self { + IoEngine::NotSet => panic!("not initialized"), + IoEngine::StdFs => { + let span = tracing::info_span!("spawn_blocking_block_on_if_std"); + tokio::task::spawn_blocking({ + move || tokio::runtime::Handle::current().block_on(work.instrument(span)) + }) + .await + .expect("failed to join blocking code most likely it panicked, panicking as well") + } + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => work.await, + } + } +} + +pub enum FeatureTestResult { + PlatformPreferred(IoEngineKind), + Worse { + engine: IoEngineKind, + remark: String, + }, +} + +impl FeatureTestResult { + #[cfg(target_os = "linux")] + const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::TokioEpollUring; + #[cfg(not(target_os = "linux"))] + const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::StdFs; +} + +impl From for IoEngineKind { + fn from(val: FeatureTestResult) -> Self { + match val { + FeatureTestResult::PlatformPreferred(e) => e, + FeatureTestResult::Worse { engine, .. } => engine, + } + } +} + +/// Somewhat costly under the hood, do only once. +/// Panics if we can't set up the feature test. 
+pub fn feature_test() -> anyhow::Result { + std::thread::spawn(|| { + + #[cfg(not(target_os = "linux"))] + { + Ok(FeatureTestResult::PlatformPreferred( + FeatureTestResult::PLATFORM_PREFERRED, + )) + } + #[cfg(target_os = "linux")] + { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + Ok(match rt.block_on(tokio_epoll_uring::System::launch()) { + Ok(_) => FeatureTestResult::PlatformPreferred({ + assert!(matches!( + IoEngineKind::TokioEpollUring, + FeatureTestResult::PLATFORM_PREFERRED + )); + FeatureTestResult::PLATFORM_PREFERRED + }), + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => { + let remark = match e.raw_os_error() { + Some(nix::libc::EPERM) => { + // fall back + "creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled " + .to_string() + } + Some(nix::libc::EFAULT) => { + // fail feature test + anyhow::bail!( + "creating tokio-epoll-uring fails with EFAULT, might have corrupted memory" + ); + } + Some(_) | None => { + // fall back + format!("creating tokio-epoll-uring fails with error: {e:#}") + } + }; + FeatureTestResult::Worse { + engine: IoEngineKind::StdFs, + remark, + } + } + }) + } + }) + .join() + .unwrap() +} diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs new file mode 100644 index 0000000000..6ea19d6b2d --- /dev/null +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -0,0 +1,194 @@ +//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific +//! handling in case the instance can't launched. +//! +//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation +//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series. +//! See for more details. 
+ +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::Arc; + +use tokio_util::sync::CancellationToken; +use tracing::{error, info, info_span, warn, Instrument}; +use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; + +use tokio_epoll_uring::{System, SystemHandle}; + +use crate::virtual_file::on_fatal_io_error; + +use crate::metrics::tokio_epoll_uring as metrics; + +#[derive(Clone)] +struct ThreadLocalState(Arc); + +struct ThreadLocalStateInner { + cell: tokio::sync::OnceCell, + launch_attempts: AtomicU32, + /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`] + thread_local_state_id: u64, +} + +impl ThreadLocalState { + pub fn new() -> Self { + Self(Arc::new(ThreadLocalStateInner { + cell: tokio::sync::OnceCell::default(), + launch_attempts: AtomicU32::new(0), + thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed), + })) + } + + pub fn make_id_string(&self) -> String { + format!("{}", self.0.thread_local_state_id) + } +} + +static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0); + +thread_local! { + static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new(); +} + +/// Panics if we cannot [`System::launch`]. +pub async fn thread_local_system() -> Handle { + let fake_cancel = CancellationToken::new(); + loop { + let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone()); + let inner = &thread_local_state.0; + let get_or_init_res = inner + .cell + .get_or_try_init(|| async { + let attempt_no = inner + .launch_attempts + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no); + async { + // Rate-limit retries per thread-local. + // NB: doesn't yield to executor at attempt_no=0. 
+ utils::backoff::exponential_backoff( + attempt_no, + DEFAULT_BASE_BACKOFF_SECONDS, + DEFAULT_MAX_BACKOFF_SECONDS, + &fake_cancel, + ) + .await; + let res = System::launch() + // this might move us to another executor thread => loop outside the get_or_try_init, not inside it + .await; + match res { + Ok(system) => { + info!("successfully launched system"); + metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc(); + Ok(system) + } + Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => { + warn!("not enough locked memory to tokio-epoll-uring, will retry"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc(); + Err(()) + } + // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere. + // This is equivalent to a fatal IO error. + Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => { + error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process"); + info_span!("stats").in_scope(|| { + emit_launch_failure_process_stats(); + }); + on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring"); + }, + } + } + .instrument(span) + .await + }) + .await; + if get_or_init_res.is_ok() { + return Handle(thread_local_state); + } + } +} + +fn emit_launch_failure_process_stats() { + // tokio-epoll-uring stats + // vmlck + rlimit + // number of threads + // rss / system memory usage generally + + let tokio_epoll_uring::metrics::Metrics { + systems_created, + systems_destroyed, + } = tokio_epoll_uring::metrics::global(); + info!(systems_created, systems_destroyed, "tokio-epoll-uring"); + + match procfs::process::Process::myself() { + Ok(myself) => { + match myself.limits() { + Ok(limits) => { + info!(?limits.max_locked_memory, "/proc/self/limits"); + } + Err(error) => { + info!(%error, "no limit stats due to error"); + } + } + + match 
myself.status() { + Ok(status) => { + let procfs::process::Status { + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + .. + } = status; + info!( + vmsize, + vmlck, + vmpin, + vmrss, + rssanon, + rssfile, + rssshmem, + vmdata, + vmstk, + vmexe, + vmlib, + vmpte, + threads, + "/proc/self/status" + ); + } + Err(error) => { + info!(%error, "no status status due to error"); + } + } + } + Err(error) => { + info!(%error, "no process stats due to error"); + } + }; +} + +#[derive(Clone)] +pub struct Handle(ThreadLocalState); + +impl std::ops::Deref for Handle { + type Target = SystemHandle; + + fn deref(&self) -> &Self::Target { + self.0 + .0 + .cell + .get() + .expect("must be already initialized when using this") + } +} diff --git a/pageserver/src/virtual_file/metadata.rs b/pageserver/src/virtual_file/metadata.rs new file mode 100644 index 0000000000..f530c50988 --- /dev/null +++ b/pageserver/src/virtual_file/metadata.rs @@ -0,0 +1,30 @@ +use std::fs; + +pub enum Metadata { + StdFs(fs::Metadata), + #[cfg(target_os = "linux")] + TokioEpollUring(Box), +} + +#[cfg(target_os = "linux")] +impl From> for Metadata { + fn from(value: Box) -> Self { + Metadata::TokioEpollUring(value) + } +} + +impl From for Metadata { + fn from(value: std::fs::Metadata) -> Self { + Metadata::StdFs(value) + } +} + +impl Metadata { + pub fn len(&self) -> u64 { + match self { + Metadata::StdFs(metadata) => metadata.len(), + #[cfg(target_os = "linux")] + Metadata::TokioEpollUring(statx) => statx.stx_size, + } + } +} diff --git a/pageserver/src/virtual_file/open_options.rs b/pageserver/src/virtual_file/open_options.rs new file mode 100644 index 0000000000..7f951270d1 --- /dev/null +++ b/pageserver/src/virtual_file/open_options.rs @@ -0,0 +1,139 @@ +//! 
Enum-dispatch to the `OpenOptions` type of the respective [`super::IoEngineKind`]; + +use super::io_engine::IoEngine; +use std::{os::fd::OwnedFd, path::Path}; + +#[derive(Debug, Clone)] +pub enum OpenOptions { + StdFs(std::fs::OpenOptions), + #[cfg(target_os = "linux")] + TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions), +} + +impl Default for OpenOptions { + fn default() -> Self { + match super::io_engine::get() { + IoEngine::NotSet => panic!("io engine not set"), + IoEngine::StdFs => Self::StdFs(std::fs::OpenOptions::new()), + #[cfg(target_os = "linux")] + IoEngine::TokioEpollUring => { + Self::TokioEpollUring(tokio_epoll_uring::ops::open_at::OpenOptions::new()) + } + } + } +} + +impl OpenOptions { + pub fn new() -> OpenOptions { + Self::default() + } + + pub fn read(&mut self, read: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.read(read); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.read(read); + } + } + self + } + + pub fn write(&mut self, write: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.write(write); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.write(write); + } + } + self + } + + pub fn create(&mut self, create: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.create(create); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.create(create); + } + } + self + } + + pub fn create_new(&mut self, create_new: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.create_new(create_new); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.create_new(create_new); + } + } + self + } + + pub fn truncate(&mut self, truncate: bool) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.truncate(truncate); + } + #[cfg(target_os = "linux")] + 
OpenOptions::TokioEpollUring(x) => { + let _ = x.truncate(truncate); + } + } + self + } + + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { + match self { + OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()), + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; + system.open(path, x).await.map_err(|e| match e { + tokio_epoll_uring::Error::Op(e) => e, + tokio_epoll_uring::Error::System(system) => { + std::io::Error::new(std::io::ErrorKind::Other, system) + } + }) + } + } + } +} + +impl std::os::unix::prelude::OpenOptionsExt for OpenOptions { + fn mode(&mut self, mode: u32) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.mode(mode); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.mode(mode); + } + } + self + } + + fn custom_flags(&mut self, flags: i32) -> &mut OpenOptions { + match self { + OpenOptions::StdFs(x) => { + let _ = x.custom_flags(flags); + } + #[cfg(target_os = "linux")] + OpenOptions::TokioEpollUring(x) => { + let _ = x.custom_flags(flags); + } + } + self + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs new file mode 100644 index 0000000000..55b1d0b46b --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs @@ -0,0 +1,47 @@ +use crate::{context::RequestContext, virtual_file::owned_buffers_io::write::OwnedAsyncWriter}; +use tokio_epoll_uring::{BoundedBuf, IoBuf}; + +pub struct Writer { + dst: W, + bytes_amount: u64, +} + +impl Writer { + pub fn new(dst: W) -> Self { + Self { + dst, + bytes_amount: 0, + } + } + + pub fn bytes_written(&self) -> u64 { + self.bytes_amount + } + + pub fn as_inner(&self) -> &W { + &self.dst + } + + /// Returns the wrapped `VirtualFile` object as well as the 
number + /// of bytes that were written to it through this object. + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub fn into_inner(self) -> (u64, W) { + (self.bytes_amount, self.dst) + } +} + +impl OwnedAsyncWriter for Writer +where + W: OwnedAsyncWriter, +{ + #[inline(always)] + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; + self.bytes_amount += u64::try_from(nwritten).unwrap(); + Ok((nwritten, buf)) + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs new file mode 100644 index 0000000000..885a9221c5 --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -0,0 +1,348 @@ +use bytes::BytesMut; +use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; + +use crate::context::RequestContext; + +/// A trait for doing owned-buffer write IO. +/// Think [`tokio::io::AsyncWrite`] but with owned buffers. +pub trait OwnedAsyncWriter { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + ctx: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)>; +} + +/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch +/// small writes into larger writes of size [`Buffer::cap`]. +/// +/// # Passthrough Of Large Writers +/// +/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] +/// cause the internal buffer to be flushed prematurely so that the large +/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. +/// +/// This pass-through is generally beneficial for throughput, but if +/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, +/// unlimited large writes may cause latency or fairness issues. +/// +/// In such cases, a different implementation that always buffers in memory +/// may be preferable. 
+pub struct BufferedWriter { + writer: W, + /// invariant: always remains Some(buf) except + /// - while IO is ongoing => goes back to Some() once the IO completed successfully + /// - after an IO error => stays `None` forever + /// In these exceptional cases, it's `None`. + buf: Option, +} + +impl BufferedWriter +where + B: Buffer + Send, + Buf: IoBuf + Send, + W: OwnedAsyncWriter, +{ + pub fn new(writer: W, buf: B) -> Self { + Self { + writer, + buf: Some(buf), + } + } + + pub fn as_inner(&self) -> &W { + &self.writer + } + + /// Panics if used after any of the write paths returned an error + pub fn inspect_buffer(&self) -> &B { + self.buf() + } + + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + self.flush(ctx).await?; + + let Self { buf, writer } = self; + assert!(buf.is_some()); + Ok(writer) + } + + #[inline(always)] + fn buf(&self) -> &B { + self.buf + .as_ref() + .expect("must not use after we returned an error") + } + + #[cfg_attr(target_os = "macos", allow(dead_code))] + pub async fn write_buffered( + &mut self, + chunk: Slice, + ctx: &RequestContext, + ) -> std::io::Result<(usize, S)> { + let chunk_len = chunk.len(); + // avoid memcpy for the middle of the chunk + if chunk.len() >= self.buf().cap() { + self.flush(ctx).await?; + // do a big write, bypassing `buf` + assert_eq!( + self.buf + .as_ref() + .expect("must not use after an error") + .pending(), + 0 + ); + let (nwritten, chunk) = self.writer.write_all(chunk, ctx).await?; + assert_eq!(nwritten, chunk_len); + return Ok((nwritten, chunk)); + } + // in-memory copy the < BUFFER_SIZED tail of the chunk + assert!(chunk.len() < self.buf().cap()); + let mut slice = &chunk[..]; + while !slice.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = buf.cap() - buf.pending(); + let have = slice.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&slice[..n]); + 
slice = &slice[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; + } + } + assert!(slice.is_empty(), "by now we should have drained the chunk"); + Ok((chunk_len, chunk.into_inner())) + } + + /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. + /// + /// It is less performant because we always have to copy the borrowed data into the internal buffer + /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant + /// for large writes. + pub async fn write_buffered_borrowed( + &mut self, + mut chunk: &[u8], + ctx: &RequestContext, + ) -> std::io::Result { + let chunk_len = chunk.len(); + while !chunk.is_empty() { + let buf = self.buf.as_mut().expect("must not use after an error"); + let need = buf.cap() - buf.pending(); + let have = chunk.len(); + let n = std::cmp::min(need, have); + buf.extend_from_slice(&chunk[..n]); + chunk = &chunk[n..]; + if buf.pending() >= buf.cap() { + assert_eq!(buf.pending(), buf.cap()); + self.flush(ctx).await?; + } + } + Ok(chunk_len) + } + + async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { + let buf = self.buf.take().expect("must not use after an error"); + let buf_len = buf.pending(); + if buf_len == 0 { + self.buf = Some(buf); + return Ok(()); + } + let (nwritten, io_buf) = self.writer.write_all(buf.flush(), ctx).await?; + assert_eq!(nwritten, buf_len); + self.buf = Some(Buffer::reuse_after_flush(io_buf)); + Ok(()) + } +} + +/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones. +pub trait Buffer { + type IoBuf: IoBuf; + + /// Capacity of the buffer. Must not change over the lifetime `self`.` + fn cap(&self) -> usize; + + /// Add data to the buffer. + /// Panics if there is not enough room to accomodate `other`'s content, i.e., + /// panics if `other.len() > self.cap() - self.pending()`. 
+ fn extend_from_slice(&mut self, other: &[u8]); + + /// Number of bytes in the buffer. + fn pending(&self) -> usize; + + /// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data + /// so we can use [`tokio_epoll_uring`] to write it to disk. + fn flush(self) -> Slice; + + /// After the write to disk is done and we have gotten back the slice, + /// [`BufferedWriter`] uses this method to re-use the io buffer. + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; +} + +impl Buffer for BytesMut { + type IoBuf = BytesMut; + + #[inline(always)] + fn cap(&self) -> usize { + self.capacity() + } + + fn extend_from_slice(&mut self, other: &[u8]) { + BytesMut::extend_from_slice(self, other) + } + + #[inline(always)] + fn pending(&self) -> usize { + self.len() + } + + fn flush(self) -> Slice { + if self.is_empty() { + return self.slice_full(); + } + let len = self.len(); + self.slice(0..len) + } + + fn reuse_after_flush(mut iobuf: BytesMut) -> Self { + iobuf.clear(); + iobuf + } +} + +impl OwnedAsyncWriter for Vec { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + _: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + return Ok((0, Slice::into_inner(buf.slice_full()))); + } + let buf = buf.slice(0..nbytes); + self.extend_from_slice(&buf[..]); + Ok((buf.len(), Slice::into_inner(buf))) + } +} + +#[cfg(test)] +mod tests { + use bytes::BytesMut; + + use super::*; + use crate::context::{DownloadBehavior, RequestContext}; + use crate::task_mgr::TaskKind; + + #[derive(Default)] + struct RecorderWriter { + writes: Vec>, + } + impl OwnedAsyncWriter for RecorderWriter { + async fn write_all, Buf: IoBuf + Send>( + &mut self, + buf: B, + _: &RequestContext, + ) -> std::io::Result<(usize, B::Buf)> { + let nbytes = buf.bytes_init(); + if nbytes == 0 { + self.writes.push(vec![]); + return Ok((0, Slice::into_inner(buf.slice_full()))); + } + let buf = buf.slice(0..nbytes); + 
self.writes.push(Vec::from(&buf[..])); + Ok((buf.len(), Slice::into_inner(buf))) + } + } + + fn test_ctx() -> RequestContext { + RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) + } + + macro_rules! write { + ($writer:ident, $data:literal) => {{ + $writer + .write_buffered(::bytes::Bytes::from_static($data).slice_full(), &test_ctx()) + .await?; + }}; + } + + #[tokio::test] + async fn test_buffered_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + write!(writer, b"a"); + write!(writer, b"b"); + write!(writer, b"c"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_writes_only() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + write!(writer, b"abc"); + write!(writer, b"de"); + write!(writer, b""); + write!(writer, b"fghijk"); + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + write!(writer, b"a"); + write!(writer, b"bc"); + write!(writer, b"d"); + write!(writer, b"e"); + let recorder = writer.flush_and_into_inner(&test_ctx()).await?; + assert_eq!( + recorder.writes, + vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] + ); + Ok(()) + } + + #[tokio::test] + async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + let ctx = test_ctx(); + 
let ctx = &ctx; + let recorder = RecorderWriter::default(); + let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + + writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"d", ctx).await?; + writer.write_buffered_borrowed(b"e", ctx).await?; + writer.write_buffered_borrowed(b"fg", ctx).await?; + writer.write_buffered_borrowed(b"hi", ctx).await?; + writer.write_buffered_borrowed(b"j", ctx).await?; + writer.write_buffered_borrowed(b"klmno", ctx).await?; + + let recorder = writer.flush_and_into_inner(ctx).await?; + assert_eq!( + recorder.writes, + { + let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; + expect + } + .iter() + .map(|v| v[..].to_vec()) + .collect::>() + ); + Ok(()) + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 539860241d..bb02e97dd7 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -33,12 +33,13 @@ use utils::failpoint_support; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; -use crate::pgdatadir_mapping::*; +use crate::pgdatadir_mapping::{DatadirModification, Version}; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::walrecord::*; use crate::ZERO_PAGE; -use pageserver_api::reltag::{RelTag, SlruKind}; +use pageserver_api::key::rel_block_to_key; +use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; @@ -132,10 +133,14 @@ impl WalIngest { buf.advance(decoded.main_data_offset); assert!(!self.checkpoint_modified); - if self.checkpoint.update_next_xid(decoded.xl_xid) { + if decoded.xl_xid != pg_constants::INVALID_TRANSACTION_ID + && self.checkpoint.update_next_xid(decoded.xl_xid) + { self.checkpoint_modified = true; } + 
failpoint_support::sleep_millis_async!("wal-ingest-record-sleep"); + #[allow(clippy::if_same_then_else)] match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { @@ -271,6 +276,7 @@ impl WalIngest { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, + decoded.origin_id, ctx, ) .await?; @@ -283,6 +289,7 @@ impl WalIngest { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + decoded.origin_id, ctx, ) .await?; @@ -388,10 +395,19 @@ impl WalIngest { < 0 { self.checkpoint.oldestXid = xlog_checkpoint.oldestXid; - self.checkpoint_modified = true; - } else { - special_treatment_check!(needs none); } + trace!( + "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", + xlog_checkpoint.oldestActiveXid, + self.checkpoint.oldestActiveXid + ); + self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + + // Write a new checkpoint key-value pair on every checkpoint record, even + // if nothing really changed. Not strictly required, but it seems nice to + // have some trace of the checkpoint records in the layer files at the same + // LSNs. 
+ self.checkpoint_modified = true; } else if info == pg_constants::XLOG_FPI || info == pg_constants::XLOG_FPI_FOR_HINT { // These records are importan for us, bu they are handled by @@ -423,7 +439,7 @@ impl WalIngest { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = XlLogicalMessage::decode(&mut buf); + let xlrec = crate::walrecord::XlLogicalMessage::decode(&mut buf); let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size]; if prefix == "neon-test" { @@ -442,7 +458,25 @@ impl WalIngest { special_treatment_check!(unknown record type, pg_version, lsn, decoded); } } - pg_constants::RM_STANDBY_ID => special_treatment_check!(needs none), + pg_constants::RM_STANDBY_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_RUNNING_XACTS { + let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); + self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + } + } + pg_constants::RM_REPLORIGIN_ID => { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_REPLORIGIN_SET { + let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf); + modification + .set_replorigin(xlrec.node_id, xlrec.remote_lsn) + .await? + } else if info == pg_constants::XLOG_REPLORIGIN_DROP { + let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf); + modification.drop_replorigin(xlrec.node_id).await? + } + } // All of these are handled by the generic ingest_decoded_block function pg_constants::RM_BTREE_ID => special_treatment_check!(needs none), @@ -458,10 +492,6 @@ impl WalIngest { // these records though. pg_constants::RM_COMMIT_TS_ID => special_treatment_check!(needs none), - // These are related to logical replication. I don't know if we should - // do something with them. @knizhnik? 
- pg_constants::RM_REPLORIGIN_ID => special_treatment_check!(needs none), - _x => special_treatment_check!(unknown record type, pg_version, lsn, decoded), }; @@ -487,7 +517,7 @@ impl WalIngest { ); if !key_is_local { - if self.shard.is_zero() { + if self.shard.is_shard_zero() { // Shard 0 tracks relation sizes. Although we will not store this block, we will observe // its blkno in case it implicitly extends a relation. self.observe_decoded_block(modification, blk, ctx).await?; @@ -1211,7 +1241,7 @@ impl WalIngest { let nblocks = modification .tline - .get_rel_size(src_rel, Version::Modified(modification), true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1225,17 +1255,27 @@ impl WalIngest { // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); for blknum in 0..nblocks { - debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); + // Sharding: + // - src and dst are always on the same shard, because they differ only by dbNode, and + // dbNode is not included in the hash inputs for sharding. + // - This WAL command is replayed on all shards, but each shard only copies the blocks + // that belong to it. 
+ let src_key = rel_block_to_key(src_rel, blknum); + if !self.shard.is_key_local(&src_key) { + debug!( + "Skipping non-local key {} during XLOG_DBASE_CREATE", + src_key + ); + continue; + } + debug!( + "copying block {} from {} ({}) to {}", + blknum, src_rel, src_key, dst_rel + ); let content = modification .tline - .get_rel_page_at_lsn( - src_rel, - blknum, - Version::Modified(modification), - true, - ctx, - ) + .get_rel_page_at_lsn(src_rel, blknum, Version::Modified(modification), ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1345,6 +1385,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, + origin_id: u16, ctx: &RequestContext, ) -> anyhow::Result<()> { // Record update of CLOG pages @@ -1403,13 +1444,18 @@ impl WalIngest { }; if modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? 
{ self.put_rel_drop(modification, rel, ctx).await?; } } } + if origin_id != 0 { + modification + .set_replorigin(origin_id, parsed.origin_lsn) + .await?; + } Ok(()) } @@ -1539,16 +1585,22 @@ impl WalIngest { self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; self.checkpoint_modified = true; } - let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| { - if mbr.xid.wrapping_sub(acc) as i32 > 0 { - mbr.xid + let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| { + if let Some(max_xid) = acc { + if mbr.xid.wrapping_sub(max_xid) as i32 > 0 { + Some(mbr.xid) + } else { + acc + } } else { - acc + Some(mbr.xid) } }); - if self.checkpoint.update_next_xid(max_mbr_xid) { - self.checkpoint_modified = true; + if let Some(max_xid) = max_mbr_xid { + if self.checkpoint.update_next_xid(max_xid) { + self.checkpoint_modified = true; + } } Ok(()) } @@ -1696,7 +1748,7 @@ impl WalIngest { nblocks } else if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1708,7 +1760,7 @@ impl WalIngest { } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? }; @@ -1805,14 +1857,14 @@ async fn get_relsize( ) -> anyhow::Result { let nblocks = if !modification .tline - .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .get_rel_exists(rel, Version::Modified(modification), ctx) .await? { 0 } else { modification .tline - .get_rel_size(rel, Version::Modified(modification), true, ctx) + .get_rel_size(rel, Version::Modified(modification), ctx) .await? 
}; Ok(nblocks) @@ -1824,8 +1876,6 @@ mod tests { use super::*; use crate::tenant::harness::*; use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH}; - use crate::tenant::Timeline; - use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT; use postgres_ffi::RELSEG_SIZE; use crate::DEFAULT_PG_VERSION; @@ -1865,22 +1915,22 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?; walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx) .await?; m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx) .await?; m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx) .await?; m.commit(&ctx).await?; @@ -1889,29 +1939,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); @@ -1919,48 +1969,48 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), &ctx) .await?, - TEST_IMG("foo blk 0 at 2") + test_img("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + 
test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG("foo blk 2 at 5") + test_img("foo blk 2 at 5") ); // Truncate last block @@ -1974,35 +2024,35 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), &ctx) .await?, - TEST_IMG("foo blk 0 at 3") + test_img("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), &ctx) .await?, - TEST_IMG("foo blk 1 at 4") + test_img("foo blk 1 at 4") ); // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG("foo blk 2 at 5") + test_img("foo blk 2 at 5") ); // Truncate to zero length @@ -2013,7 +2063,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx) .await?, 0 ); @@ -2021,53 +2071,53 @@ mod tests { // Extend from 0 to 2 blocks, leaving a gap let mut m = 
tline.begin_modification(Lsn(0x70)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx) .await?; m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), &ctx) .await?, - TEST_IMG("foo blk 1") + test_img("foo blk 1") ); // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx) .await?; m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), &ctx) .await?, - TEST_IMG("foo blk 1500") + test_img("foo blk 1500") ); Ok(()) @@ -2085,20 +2135,20 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; m.commit(&ctx).await?; // Check 
that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, 1 ); @@ -2111,7 +2161,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx) .await?, false ); @@ -2122,20 +2172,20 @@ mod tests { // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx) .await?; m.commit(&ctx).await?; // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx) .await?, 1 ); @@ -2160,7 +2210,7 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; } m.commit(&ctx).await?; @@ -2168,24 +2218,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx) .await?, relsize ); @@ -2196,9 +2246,9 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2213,7 +2263,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx) .await?, 1 ); @@ -2223,16 +2273,16 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx) .await?, relsize ); @@ -2241,9 +2291,9 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ 
-2254,20 +2304,20 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) + .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; } m.commit(&ctx).await?; assert_eq!( tline - .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx) .await?, relsize ); @@ -2277,9 +2327,9 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), &ctx) .await?, - TEST_IMG(&data) + test_img(&data) ); } @@ -2300,7 +2350,7 @@ mod tests { for blknum in 0..RELSEG_SIZE + 1 { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); - let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); + let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; @@ -2311,7 +2361,7 @@ mod tests { assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE + 1 ); @@ -2325,7 +2375,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE ); @@ -2340,7 +2390,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, RELSEG_SIZE - 1 ); @@ -2358,7 +2408,7 @@ mod tests { m.commit(&ctx).await?; 
assert_eq!( tline - .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx) .await?, size as BlockNumber ); @@ -2401,7 +2451,8 @@ mod tests { let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); let (tenant, ctx) = harness.load().await; - let remote_initdb_path = remote_initdb_archive_path(&tenant.tenant_id(), &TIMELINE_ID); + let remote_initdb_path = + remote_initdb_archive_path(&tenant.tenant_shard_id().tenant_id, &TIMELINE_ID); let initdb_path = harness.remote_fs_dir.join(remote_initdb_path.get_path()); std::fs::create_dir_all(initdb_path.parent().unwrap()) diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index ff6bc9194b..62a3a91b0b 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -9,10 +9,10 @@ use postgres_ffi::pg_constants; use postgres_ffi::BLCKSZ; use postgres_ffi::{BlockNumber, TimestampTz}; use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; -use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; +use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; use tracing::*; -use utils::bin_ser::DeserializeError; +use utils::{bin_ser::DeserializeError, lsn::Lsn}; /// Each update to a page is represented by a NeonWalRecord. It can be a wrapper /// around a PostgreSQL WAL record, or a custom neon-specific "record". @@ -44,19 +44,66 @@ pub enum NeonWalRecord { moff: MultiXactOffset, members: Vec, }, + /// Update the map of AUX files, either writing or dropping an entry + AuxFile { + file_path: String, + content: Option, + }, + + /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. + #[cfg(test)] + Test { + /// Append a string to the image. + append: String, + /// Clear the image before appending. + clear: bool, + /// Treat this record as an init record. 
`clear` should be set to true if this field is set + /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and + /// its references in `timeline.rs`. + will_init: bool, + }, } impl NeonWalRecord { /// Does replaying this WAL record initialize the page from scratch, or does /// it need to be applied over the previous image of the page? pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init match self { NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, - + #[cfg(test)] + NeonWalRecord::Test { will_init, .. } => *will_init, // None of the special neon record types currently initialize the page _ => false, } } + + #[cfg(test)] + pub(crate) fn wal_append(s: impl AsRef) -> Self { + Self::Test { + append: s.as_ref().to_string(), + clear: false, + will_init: false, + } + } + + #[cfg(test)] + pub(crate) fn wal_clear() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: false, + } + } + + #[cfg(test)] + pub(crate) fn wal_init() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: true, + } + } } /// DecodedBkpBlock represents per-page data contained in a WAL record. 
@@ -110,6 +157,7 @@ pub struct DecodedWALRecord { pub blocks: Vec, pub main_data_offset: usize, + pub origin_id: u16, } #[repr(C)] @@ -567,6 +615,7 @@ pub struct XlXactParsedRecord { pub subxacts: Vec, pub xnodes: Vec, + pub origin_lsn: Lsn, } impl XlXactParsedRecord { @@ -645,6 +694,11 @@ impl XlXactParsedRecord { debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); } + let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 { + Lsn(buf.get_u64_le()) + } else { + Lsn::INVALID + }; XlXactParsedRecord { xid, info, @@ -654,6 +708,7 @@ impl XlXactParsedRecord { ts_id, subxacts, xnodes, + origin_lsn, } } } @@ -768,6 +823,72 @@ impl XlLogicalMessage { } } +#[repr(C)] +#[derive(Debug)] +pub struct XlRunningXacts { + pub xcnt: u32, + pub subxcnt: u32, + pub subxid_overflow: bool, + pub next_xid: TransactionId, + pub oldest_running_xid: TransactionId, + pub latest_completed_xid: TransactionId, + pub xids: Vec, +} + +impl XlRunningXacts { + pub fn decode(buf: &mut Bytes) -> XlRunningXacts { + let xcnt = buf.get_u32_le(); + let subxcnt = buf.get_u32_le(); + let subxid_overflow = buf.get_u32_le() != 0; + let next_xid = buf.get_u32_le(); + let oldest_running_xid = buf.get_u32_le(); + let latest_completed_xid = buf.get_u32_le(); + let mut xids = Vec::new(); + for _ in 0..(xcnt + subxcnt) { + xids.push(buf.get_u32_le()); + } + XlRunningXacts { + xcnt, + subxcnt, + subxid_overflow, + next_xid, + oldest_running_xid, + latest_completed_xid, + xids, + } + } +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginDrop { + pub node_id: RepOriginId, +} + +impl XlReploriginDrop { + pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { + XlReploriginDrop { + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginSet { + pub remote_lsn: Lsn, + pub node_id: RepOriginId, +} + +impl XlReploriginSet { + pub fn decode(buf: &mut Bytes) -> XlReploriginSet { + XlReploriginSet { + remote_lsn: Lsn(buf.get_u64_le()), + node_id: 
buf.get_u16_le(), + } + } +} + /// Main routine to decode a WAL record and figure out which blocks are modified // // See xlogrecord.h for details @@ -802,6 +923,7 @@ pub fn decode_wal_record( let mut rnode_dbnode: u32 = 0; let mut rnode_relnode: u32 = 0; let mut got_rnode = false; + let mut origin_id: u16 = 0; let mut buf = record.clone(); @@ -849,7 +971,7 @@ pub fn decode_wal_record( pg_constants::XLR_BLOCK_ID_ORIGIN => { // RepOriginId is uint16 - buf.advance(2); + origin_id = buf.get_u16_le(); } pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { @@ -1046,6 +1168,7 @@ pub fn decode_wal_record( decoded.xl_info = xlogrec.xl_info; decoded.xl_rmid = xlogrec.xl_rmid; decoded.record = record; + decoded.origin_id = origin_id; decoded.main_data_offset = main_data_offset; Ok(()) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 6918698f29..d562540bde 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -17,70 +17,30 @@ //! records. It achieves it by dropping privileges before replaying //! any WAL records, so that even if an attacker hijacks the Postgres //! process, he cannot escape out of it. -//! -use anyhow::Context; -use byteorder::{ByteOrder, LittleEndian}; -use bytes::{BufMut, Bytes, BytesMut}; -use nix::poll::*; -use pageserver_api::shard::TenantShardId; -use serde::Serialize; -use std::collections::VecDeque; -use std::io; -use std::io::prelude::*; -use std::ops::{Deref, DerefMut}; -use std::os::unix::io::AsRawFd; -use std::os::unix::prelude::CommandExt; -use std::process::Stdio; -use std::process::{Child, ChildStdin, ChildStdout, Command}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock}; -use std::time::Duration; -use std::time::Instant; -use tracing::*; -use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock}; -#[cfg(feature = "testing")] -use std::sync::atomic::{AtomicUsize, Ordering}; +/// Process lifecycle and abstracction for the IPC protocol. +mod process; + +/// Code to apply [`NeonWalRecord`]s. 
+pub(crate) mod apply_neon; use crate::config::PageServerConf; use crate::metrics::{ - WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, - WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, + WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, }; -use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::repository::Key; use crate::walrecord::NeonWalRecord; -use pageserver_api::reltag::{RelTag, SlruKind}; -use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; -use postgres_ffi::v14::nonrelfile_utils::{ - mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, - transaction_id_set_status, -}; -use postgres_ffi::BLCKSZ; - -/// -/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. -/// -/// In Postgres `BufferTag` structure is used for exactly the same purpose. -/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). 
-/// -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] -pub(crate) struct BufferTag { - pub rel: RelTag, - pub blknum: u32, -} - -struct ProcessInput { - stdin: ChildStdin, - n_requests: usize, -} - -struct ProcessOutput { - stdout: ChildStdout, - pending_responses: VecDeque>, - n_processed_responses: usize, -} +use anyhow::Context; +use bytes::{Bytes, BytesMut}; +use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; +use pageserver_api::shard::TenantShardId; +use std::sync::Arc; +use std::time::Duration; +use std::time::Instant; +use tracing::*; +use utils::lsn::Lsn; +use utils::sync::heavier_once_cell; /// /// This is the real implementation that uses a Postgres process to @@ -93,22 +53,19 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - redo_process: RwLock>>, -} - -/// Can this request be served by neon redo functions -/// or we need to pass it to wal-redo postgres process? -fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { - // Currently, we don't have bespoken Rust code to replay any - // Postgres WAL records. But everything else is handled in neon. - #[allow(clippy::match_like_matches_macro)] - match rec { - NeonWalRecord::Postgres { - will_init: _, - rec: _, - } => false, - _ => true, - } + /// The current [`process::WalRedoProcess`] that is used by new redo requests. + /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo + /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the + /// their process object; we use [`Arc::clone`] for that. + /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] + /// had that behavior; it's probably unnecessary. + /// The only merit of it is that if one walredo process encounters an error, + /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`]. 
+ /// and retry redo, thereby starting the new process, while other redo tasks might + /// still be using the old redo process. But, those other tasks will most likely + /// encounter an error as well, and errors are an unexpected condition anyway. + /// So, probably we could get rid of the `Arc` in the future. + redo_process: heavier_once_cell::OnceCell>, } /// @@ -138,10 +95,10 @@ impl PostgresRedoManager { let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); let mut img = base_img.map(|p| p.1); - let mut batch_neon = can_apply_in_neon(&records[0].1); + let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1); let mut batch_start = 0; for (i, record) in records.iter().enumerate().skip(1) { - let rec_neon = can_apply_in_neon(&record.1); + let rec_neon = apply_neon::can_apply_in_neon(&record.1); if rec_neon != batch_neon { let result = if batch_neon { @@ -156,6 +113,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await }; img = Some(result?); @@ -176,6 +134,24 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await + } + } + + pub fn status(&self) -> WalRedoManagerStatus { + WalRedoManagerStatus { + last_redo_at: { + let at = *self.last_redo_at.lock().unwrap(); + at.and_then(|at| { + let age = at.elapsed(); + // map any chrono errors silently to None here + chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) 
+ }) + }, + process: self + .redo_process + .get() + .map(|p| WalRedoManagerProcessStatus { pid: p.id() }), } } } @@ -193,7 +169,7 @@ impl PostgresRedoManager { tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), - redo_process: RwLock::new(None), + redo_process: heavier_once_cell::OnceCell::default(), } } @@ -205,8 +181,7 @@ impl PostgresRedoManager { if let Some(last_redo_at) = *g { if last_redo_at.elapsed() >= idle_timeout { drop(g); - let mut guard = self.redo_process.write().unwrap(); - *guard = None; + drop(self.redo_process.get().map(|guard| guard.take_and_deinit())); } } } @@ -215,8 +190,11 @@ impl PostgresRedoManager { /// /// Process one request for WAL redo using wal-redo postgres /// + /// # Cancel-Safety + /// + /// Cancellation safe. #[allow(clippy::too_many_arguments)] - fn apply_batch_postgres( + async fn apply_batch_postgres( &self, key: Key, lsn: Lsn, @@ -228,47 +206,42 @@ impl PostgresRedoManager { ) -> anyhow::Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; + let (rel, blknum) = key.to_rel_block().context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - // launch the WAL redo process on first use - let proc: Arc = { - let proc_guard = self.redo_process.read().unwrap(); - match &*proc_guard { - None => { - // "upgrade" to write lock to launch the process - drop(proc_guard); - let mut proc_guard = self.redo_process.write().unwrap(); - match &*proc_guard { - None => { - let timer = - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer(); - let proc = Arc::new( - WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - ); - timer.observe_duration(); - *proc_guard = Some(Arc::clone(&proc)); - proc - } - Some(proc) => Arc::clone(proc), - } + let proc: Arc = + match self.redo_process.get_or_init_detached().await { + Ok(guard) => 
Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc } - Some(proc) => Arc::clone(proc), - } - }; + }; let started_at = std::time::Instant::now(); // Relational WAL records are applied using wal-redo-postgres - let buf_tag = BufferTag { rel, blknum }; let result = proc - .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout) + .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout) + .await .context("apply_wal_records"); let duration = started_at.elapsed(); @@ -298,7 +271,7 @@ impl PostgresRedoManager { // next request will launch a new one. if let Err(e) = result.as_ref() { error!( - "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), records.last().map(|p| p.0).unwrap_or(Lsn(0)), @@ -308,34 +281,34 @@ impl PostgresRedoManager { n_attempts, e, ); - // Avoid concurrent callers hitting the same issue. - // We can't prevent it from happening because we want to enable parallelism. - { - let mut guard = self.redo_process.write().unwrap(); - match &*guard { - Some(current_field_value) => { - if Arc::ptr_eq(current_field_value, &proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. 
- *guard = None; - } - } - None => { - // Another thread was faster to observe the error, and already took the process out of rotation. - } - } - } + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // // NB: there may still be other concurrent threads using `proc`. // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep - // holding the lock while waiting for the process to exit. - // NB: the drop impl blocks the current threads with a wait() system call for - // the child process. We dropped the `guard` above so that other threads aren't - // affected. But, it's good that the current thread _does_ block to wait. - // If we instead deferred the waiting into the background / to tokio, it could - // happen that if walredo always fails immediately, we spawn processes faster + // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. In some ways the blocking is actually good: if we + // deferred the waiting into the background / to tokio if we used `tokio::process`, + // it could happen that if walredo always fails immediately, we spawn processes faster // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. // This probably needs revisiting at some later point. + match self.redo_process.get() { + None => (), + Some(guard) => { + if Arc::ptr_eq(&proc, &*guard) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. 
+ guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. drop(proc); } else if n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); @@ -392,768 +365,15 @@ impl PostgresRedoManager { &self, key: Key, page: &mut BytesMut, - _record_lsn: Lsn, + record_lsn: Lsn, record: &NeonWalRecord, ) -> anyhow::Result<()> { - match record { - NeonWalRecord::Postgres { - will_init: _, - rec: _, - } => { - anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); - } - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno, - flags, - } => { - // sanity check that this is modifying the correct relation - let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; - assert!( - rel.forknum == VISIBILITYMAP_FORKNUM, - "ClearVisibilityMapFlags record on unexpected rel {}", - rel - ); - if let Some(heap_blkno) = *new_heap_blkno { - // Calculate the VM block and offset that corresponds to the heap block. - let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); - let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); - let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); - - // Check that we're modifying the correct VM block. 
- assert!(map_block == blknum); - - // equivalent to PageGetContents(page) - let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - - map[map_byte as usize] &= !(flags << map_offset); - } - - // Repeat for 'old_heap_blkno', if any - if let Some(heap_blkno) = *old_heap_blkno { - let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); - let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); - let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); - - assert!(map_block == blknum); - - let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; - - map[map_byte as usize] &= !(flags << map_offset); - } - } - // Non-relational WAL records are handled here, with custom code that has the - // same effects as the corresponding Postgres WAL redo function. - NeonWalRecord::ClogSetCommitted { xids, timestamp } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::Clog, - "ClogSetCommitted record with unexpected key {}", - key - ); - for &xid in xids { - let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - // Check that we're modifying the correct CLOG block. 
- assert!( - segno == expected_segno, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key - ); - assert!( - blknum == expected_blknum, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key - ); - - transaction_id_set_status( - xid, - pg_constants::TRANSACTION_STATUS_COMMITTED, - page, - ); - } - - // Append the timestamp - if page.len() == BLCKSZ as usize + 8 { - page.truncate(BLCKSZ as usize); - } - if page.len() == BLCKSZ as usize { - page.extend_from_slice(×tamp.to_be_bytes()); - } else { - warn!( - "CLOG blk {} in seg {} has invalid size {}", - blknum, - segno, - page.len() - ); - } - } - NeonWalRecord::ClogSetAborted { xids } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::Clog, - "ClogSetAborted record with unexpected key {}", - key - ); - for &xid in xids { - let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - // Check that we're modifying the correct CLOG block. - assert!( - segno == expected_segno, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key - ); - assert!( - blknum == expected_blknum, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key - ); - - transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); - } - } - NeonWalRecord::MultixactOffsetCreate { mid, moff } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::MultiXactOffsets, - "MultixactOffsetCreate record with unexpected key {}", - key - ); - // Compute the block and offset to modify. - // See RecordNewMultiXact in PostgreSQL sources. 
- let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; - let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; - let offset = (entryno * 4) as usize; - - // Check that we're modifying the correct multixact-offsets block. - let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - assert!( - segno == expected_segno, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key - ); - assert!( - blknum == expected_blknum, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key - ); - - LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); - } - NeonWalRecord::MultixactMembersCreate { moff, members } => { - let (slru_kind, segno, blknum) = - key_to_slru_block(key).context("invalid record")?; - assert_eq!( - slru_kind, - SlruKind::MultiXactMembers, - "MultixactMembersCreate record with unexpected key {}", - key - ); - for (i, member) in members.iter().enumerate() { - let offset = moff + i as u32; - - // Compute the block and offset to modify. - // See RecordNewMultiXact in PostgreSQL sources. - let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32; - let memberoff = mx_offset_to_member_offset(offset); - let flagsoff = mx_offset_to_flags_offset(offset); - let bshift = mx_offset_to_flags_bitshift(offset); - - // Check that we're modifying the correct multixact-members block. 
- let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - assert!( - segno == expected_segno, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key - ); - assert!( - blknum == expected_blknum, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key - ); - - let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); - flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); - flagsval |= member.status << bshift; - LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval); - LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); - } - } - } + apply_neon::apply_in_neon(record, record_lsn, key, page)?; Ok(()) } } -/// -/// Command with ability not to give all file descriptors to child process -/// -trait CloseFileDescriptors: CommandExt { - /// - /// Close file descriptors (other than stdin, stdout, stderr) in child process - /// - fn close_fds(&mut self) -> &mut Command; -} - -impl CloseFileDescriptors for C { - fn close_fds(&mut self) -> &mut Command { - // SAFETY: Code executed inside pre_exec should have async-signal-safety, - // which means it should be safe to execute inside a signal handler. - // The precise meaning depends on platform. See `man signal-safety` - // for the linux definition. - // - // The set_fds_cloexec_threadsafe function is documented to be - // async-signal-safe. - // - // Aside from this function, the rest of the code is re-entrant and - // doesn't make any syscalls. We're just passing constants. - // - // NOTE: It's easy to indirectly cause a malloc or lock a mutex, - // which is not async-signal-safe. Be careful. 
- unsafe { - self.pre_exec(move || { - close_fds::set_fds_cloexec_threadsafe(3, &[]); - Ok(()) - }) - } - } -} - -struct WalRedoProcess { - #[allow(dead_code)] - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - // Some() on construction, only becomes None on Drop. - child: Option, - stdout: Mutex, - stdin: Mutex, - /// Counter to separate same sized walredo inputs failing at the same millisecond. - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize, -} - -impl WalRedoProcess { - // - // Start postgres binary in special WAL redo mode. - // - #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] - fn launch( - conf: &'static PageServerConf, - tenant_shard_id: TenantShardId, - pg_version: u32, - ) -> anyhow::Result { - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; - - // Start postgres itself - let child = Command::new(pg_bin_dir_path.join("postgres")) - .arg("--wal-redo") - .stdin(Stdio::piped()) - .stderr(Stdio::piped()) - .stdout(Stdio::piped()) - .env_clear() - .env("LD_LIBRARY_PATH", &pg_lib_dir_path) - .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) - // The redo process is not trusted, and runs in seccomp mode that - // doesn't allow it to open any files. We have to also make sure it - // doesn't inherit any file descriptors from the pageserver, that - // would allow an attacker to read any files that happen to be open - // in the pageserver. - // - // The Rust standard library makes sure to mark any file descriptors with - // as close-on-exec by default, but that's not enough, since we use - // libraries that directly call libc open without setting that flag. 
- .close_fds() - .spawn_no_leak_child(tenant_shard_id) - .context("spawn process")?; - WAL_REDO_PROCESS_COUNTERS.started.inc(); - let mut child = scopeguard::guard(child, |child| { - error!("killing wal-redo-postgres process due to a problem during launch"); - child.kill_and_wait(WalRedoKillCause::Startup); - }); - - let stdin = child.stdin.take().unwrap(); - let stdout = child.stdout.take().unwrap(); - let stderr = child.stderr.take().unwrap(); - let stderr = tokio::process::ChildStderr::from_std(stderr) - .context("convert to tokio::ChildStderr")?; - macro_rules! set_nonblock_or_log_err { - ($file:ident) => {{ - let res = set_nonblock($file.as_raw_fd()); - if let Err(e) = &res { - error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); - } - res - }}; - } - set_nonblock_or_log_err!(stdin)?; - set_nonblock_or_log_err!(stdout)?; - - // all fallible operations post-spawn are complete, so get rid of the guard - let child = scopeguard::ScopeGuard::into_inner(child); - - tokio::spawn( - async move { - scopeguard::defer! { - debug!("wal-redo-postgres stderr_logger_task finished"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); - } - debug!("wal-redo-postgres stderr_logger_task started"); - crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); - - use tokio::io::AsyncBufReadExt; - let mut stderr_lines = tokio::io::BufReader::new(stderr); - let mut buf = Vec::new(); - let res = loop { - buf.clear(); - // TODO we don't trust the process to cap its stderr length. - // Currently it can do unbounded Vec allocation. 
- match stderr_lines.read_until(b'\n', &mut buf).await { - Ok(0) => break Ok(()), // eof - Ok(num_bytes) => { - let output = String::from_utf8_lossy(&buf[..num_bytes]); - error!(%output, "received output"); - } - Err(e) => { - break Err(e); - } - } - }; - match res { - Ok(()) => (), - Err(e) => { - error!(error=?e, "failed to read from walredo stderr"); - } - } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) - ); - - Ok(Self { - conf, - tenant_shard_id, - child: Some(child), - stdin: Mutex::new(ProcessInput { - stdin, - n_requests: 0, - }), - stdout: Mutex::new(ProcessOutput { - stdout, - pending_responses: VecDeque::new(), - n_processed_responses: 0, - }), - #[cfg(feature = "testing")] - dump_sequence: AtomicUsize::default(), - }) - } - - fn id(&self) -> u32 { - self.child - .as_ref() - .expect("must not call this during Drop") - .id() - } - - // Apply given WAL records ('records') over an old page image. Returns - // new page image. - // - #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] - fn apply_wal_records( - &self, - tag: BufferTag, - base_img: &Option, - records: &[(Lsn, NeonWalRecord)], - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let input = self.stdin.lock().unwrap(); - - // Serialize all the messages to send the WAL redo process first. - // - // This could be problematic if there are millions of records to replay, - // but in practice the number of records is usually so small that it doesn't - // matter, and it's better to keep this code simple. - // - // Most requests start with a before-image with BLCKSZ bytes, followed by - // by some other WAL records. Start with a buffer that can hold that - // comfortably. 
- let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); - build_begin_redo_for_block_msg(tag, &mut writebuf); - if let Some(img) = base_img { - build_push_page_msg(tag, img, &mut writebuf); - } - for (lsn, rec) in records.iter() { - if let NeonWalRecord::Postgres { - will_init: _, - rec: postgres_rec, - } = rec - { - build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); - } else { - anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); - } - } - build_get_page_msg(tag, &mut writebuf); - WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - - let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout); - - if res.is_err() { - // not all of these can be caused by this particular input, however these are so rare - // in tests so capture all. - self.record_and_log(&writebuf); - } - - res - } - - fn apply_wal_records0( - &self, - writebuf: &[u8], - input: MutexGuard, - wal_redo_timeout: Duration, - ) -> anyhow::Result { - let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small. - let mut nwrite = 0usize; - - let mut stdin_pollfds = [PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT)]; - - while nwrite < writebuf.len() { - let n = loop { - match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If 'stdin' is writeable, do write. - let in_revents = stdin_pollfds[0].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += proc.stdin.write(&writebuf[nwrite..])?; - } - if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. 
- anyhow::bail!("WAL redo process closed its stdin unexpectedly"); - } - } - let request_no = proc.n_requests; - proc.n_requests += 1; - drop(proc); - - // To improve walredo performance we separate sending requests and receiving - // responses. Them are protected by different mutexes (output and input). - // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process - // then there is not warranty that T1 will first granted output mutex lock. - // To address this issue we maintain number of sent requests, number of processed - // responses and ring buffer with pending responses. After sending response - // (under input mutex), threads remembers request number. Then it releases - // input mutex, locks output mutex and fetch in ring buffer all responses until - // its stored request number. The it takes correspondent element from - // pending responses ring buffer and truncate all empty elements from the front, - // advancing processed responses number. - - let mut output = self.stdout.lock().unwrap(); - let mut stdout_pollfds = [PollFd::new(output.stdout.as_raw_fd(), PollFlags::POLLIN)]; - let n_processed_responses = output.n_processed_responses; - while n_processed_responses + output.pending_responses.len() <= request_no { - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. - let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far - while nresult < BLCKSZ.into() { - // We do two things simultaneously: reading response from stdout - // and forward any logging information that the child writes to its stderr to the page server's log. - let n = loop { - match nix::poll::poll( - &mut stdout_pollfds[..], - wal_redo_timeout.as_millis() as i32, - ) { - Err(nix::errno::Errno::EINTR) => continue, - res => break res, - } - }?; - - if n == 0 { - anyhow::bail!("WAL redo timed out"); - } - - // If we have some data in stdout, read it to the result buffer. 
- let out_revents = stdout_pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += output.stdout.read(&mut resultbuf[nresult..])?; - } - if out_revents.contains(PollFlags::POLLHUP) { - anyhow::bail!("WAL redo process closed its stdout unexpectedly"); - } - } - output - .pending_responses - .push_back(Some(Bytes::from(resultbuf))); - } - // Replace our request's response with None in `pending_responses`. - // Then make space in the ring buffer by clearing out any seqence of contiguous - // `None`'s from the front of `pending_responses`. - // NB: We can't pop_front() because other requests' responses because another - // requester might have grabbed the output mutex before us: - // T1: grab input mutex - // T1: send request_no 23 - // T1: release input mutex - // T2: grab input mutex - // T2: send request_no 24 - // T2: release input mutex - // T2: grab output mutex - // T2: n_processed_responses + output.pending_responses.len() <= request_no - // 23 0 24 - // T2: enters poll loop that reads stdout - // T2: put response for 23 into pending_responses - // T2: put response for 24 into pending_resposnes - // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back - // T2: takes its response_24 - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Some(response_23) None Back - // T2: releases output mutex - // T1: grabs output mutex - // T1: n_processed_responses + output.pending_responses.len() > request_no - // 23 2 23 - // T1: skips poll loop that reads stdout - // T1: takes its response_23 - // pending_responses now looks like this: Front None None Back - // T2: does the while loop below - // pending_responses now looks like this: Front Back - // n_processed_responses now has value 25 - let res = output.pending_responses[request_no - n_processed_responses] - 
.take() - .expect("we own this request_no, nobody else is supposed to take it"); - while let Some(front) = output.pending_responses.front() { - if front.is_none() { - output.pending_responses.pop_front(); - output.n_processed_responses += 1; - } else { - break; - } - } - Ok(res) - } - - #[cfg(feature = "testing")] - fn record_and_log(&self, writebuf: &[u8]) { - let millis = std::time::SystemTime::now() - .duration_since(std::time::SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis(); - - let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); - - // these files will be collected to an allure report - let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - - let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); - - let res = std::fs::OpenOptions::new() - .write(true) - .create_new(true) - .read(true) - .open(path) - .and_then(|mut f| f.write_all(writebuf)); - - // trip up allowed_errors - if let Err(e) = res { - tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); - } else { - tracing::error!(filename, "erroring walredo input saved"); - } - } - - #[cfg(not(feature = "testing"))] - fn record_and_log(&self, _: &[u8]) {} -} - -impl Drop for WalRedoProcess { - fn drop(&mut self) { - self.child - .take() - .expect("we only do this once") - .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); - // no way to wait for stderr_logger_task from Drop because that is async only - } -} - -/// Wrapper type around `std::process::Child` which guarantees that the child -/// will be killed and waited-for by this process before being dropped. 
-struct NoLeakChild { - tenant_id: TenantShardId, - child: Option, -} - -impl Deref for NoLeakChild { - type Target = Child; - - fn deref(&self) -> &Self::Target { - self.child.as_ref().expect("must not use from drop") - } -} - -impl DerefMut for NoLeakChild { - fn deref_mut(&mut self) -> &mut Self::Target { - self.child.as_mut().expect("must not use from drop") - } -} - -impl NoLeakChild { - fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { - let child = command.spawn()?; - Ok(NoLeakChild { - tenant_id, - child: Some(child), - }) - } - - fn kill_and_wait(mut self, cause: WalRedoKillCause) { - let child = match self.child.take() { - Some(child) => child, - None => return, - }; - Self::kill_and_wait_impl(child, cause); - } - - #[instrument(skip_all, fields(pid=child.id(), ?cause))] - fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { - scopeguard::defer! { - WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); - } - let res = child.kill(); - if let Err(e) = res { - // This branch is very unlikely because: - // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. - // - This is the only place that calls .kill() - // - We consume `self`, so, .kill() can't be called twice. - // - If the process exited by itself or was killed by someone else, - // .kill() will still succeed because we haven't wait()'ed yet. - // - // So, if we arrive here, we have really no idea what happened, - // whether the PID stored in self.child is still valid, etc. - // If this function were fallible, we'd return an error, but - // since it isn't, all we can do is log an error and proceed - // with the wait(). 
- error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); - } - - match child.wait() { - Ok(exit_status) => { - info!(exit_status = %exit_status, "wait successful"); - } - Err(e) => { - error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); - } - } - } -} - -impl Drop for NoLeakChild { - fn drop(&mut self) { - let child = match self.child.take() { - Some(child) => child, - None => return, - }; - let tenant_shard_id = self.tenant_id; - // Offload the kill+wait of the child process into the background. - // If someone stops the runtime, we'll leak the child process. - // We can ignore that case because we only stop the runtime on pageserver exit. - tokio::runtime::Handle::current().spawn(async move { - tokio::task::spawn_blocking(move || { - // Intentionally don't inherit the tracing context from whoever is dropping us. - // This thread here is going to outlive of our dropper. - let span = tracing::info_span!( - "walredo", - tenant_id = %tenant_shard_id.tenant_id, - shard_id = %tenant_shard_id.shard_slug() - ); - let _entered = span.enter(); - Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); - }) - .await - }); - } -} - -trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; -} - -impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { - NoLeakChild::spawn(tenant_id, self) - } -} - -// Functions for constructing messages to send to the postgres WAL redo -// process. See pgxn/neon_walredo/walredoproc.c for -// explanation of the protocol. 
- -fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec) { - let len = 4 + 1 + 4 * 4; - - buf.put_u8(b'B'); - buf.put_u32(len as u32); - - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); -} - -fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec) { - assert!(base_img.len() == 8192); - - let len = 4 + 1 + 4 * 4 + base_img.len(); - - buf.put_u8(b'P'); - buf.put_u32(len as u32); - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); - buf.put(base_img); -} - -fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec) { - let len = 4 + 8 + rec.len(); - - buf.put_u8(b'A'); - buf.put_u32(len as u32); - buf.put_u64(endlsn.0); - buf.put(rec); -} - -fn build_get_page_msg(tag: BufferTag, buf: &mut Vec) { - let len = 4 + 1 + 4 * 4; - - buf.put_u8(b'G'); - buf.put_u32(len as u32); - tag.ser_into(buf) - .expect("serialize BufferTag should always succeed"); -} - #[cfg(test)] mod tests { use super::PostgresRedoManager; @@ -1162,6 +382,7 @@ mod tests { use bytes::Bytes; use pageserver_api::shard::TenantShardId; use std::str::FromStr; + use tracing::Instrument; use utils::{id::TenantId, lsn::Lsn}; #[tokio::test] @@ -1186,6 +407,7 @@ mod tests { short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -1213,6 +435,7 @@ mod tests { short_records(), 14, ) + .instrument(h.span()) .await .unwrap(); @@ -1233,6 +456,7 @@ mod tests { short_records(), 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ ) + .instrument(h.span()) .await .unwrap_err(); } @@ -1261,6 +485,7 @@ mod tests { // underscored because unused, except for removal at drop _repo_dir: camino_tempfile::Utf8TempDir, manager: PostgresRedoManager, + tenant_shard_id: TenantShardId, } impl RedoHarness { @@ -1277,7 +502,11 @@ mod tests { Ok(RedoHarness { _repo_dir: repo_dir, manager, + tenant_shard_id, }) } + fn span(&self) -> tracing::Span { + tracing::info_span!("RedoHarness", 
tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } } } diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs new file mode 100644 index 0000000000..facf01004c --- /dev/null +++ b/pageserver/src/walredo/apply_neon.rs @@ -0,0 +1,319 @@ +use crate::pgdatadir_mapping::AuxFilesDirectory; +use crate::walrecord::NeonWalRecord; +use anyhow::Context; +use byteorder::{ByteOrder, LittleEndian}; +use bytes::{BufMut, BytesMut}; +use pageserver_api::key::Key; +use pageserver_api::reltag::SlruKind; +use postgres_ffi::pg_constants; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; +use postgres_ffi::v14::nonrelfile_utils::{ + mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset, + transaction_id_set_status, +}; +use postgres_ffi::BLCKSZ; +use tracing::*; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; + +/// Can this request be served by neon redo functions +/// or we need to pass it to wal-redo postgres process? +pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { + // Currently, we don't have bespoken Rust code to replay any + // Postgres WAL records. But everything else is handled in neon. 
+ #[allow(clippy::match_like_matches_macro)] + match rec { + NeonWalRecord::Postgres { + will_init: _, + rec: _, + } => false, + _ => true, + } +} + +pub(crate) fn apply_in_neon( + record: &NeonWalRecord, + lsn: Lsn, + key: Key, + page: &mut BytesMut, +) -> Result<(), anyhow::Error> { + match record { + NeonWalRecord::Postgres { + will_init: _, + rec: _, + } => { + anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); + } + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + } => { + // sanity check that this is modifying the correct relation + let (rel, blknum) = key.to_rel_block().context("invalid record")?; + assert!( + rel.forknum == VISIBILITYMAP_FORKNUM, + "ClearVisibilityMapFlags record on unexpected rel {}", + rel + ); + if let Some(heap_blkno) = *new_heap_blkno { + // Calculate the VM block and offset that corresponds to the heap block. + let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); + let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); + let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); + + // Check that we're modifying the correct VM block. 
+ assert!(map_block == blknum); + + // equivalent to PageGetContents(page) + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + + map[map_byte as usize] &= !(flags << map_offset); + postgres_ffi::page_set_lsn(page, lsn); + } + + // Repeat for 'old_heap_blkno', if any + if let Some(heap_blkno) = *old_heap_blkno { + let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno); + let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno); + let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno); + + assert!(map_block == blknum); + + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + + map[map_byte as usize] &= !(flags << map_offset); + postgres_ffi::page_set_lsn(page, lsn); + } + } + // Non-relational WAL records are handled here, with custom code that has the + // same effects as the corresponding Postgres WAL redo function. + NeonWalRecord::ClogSetCommitted { xids, timestamp } => { + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetCommitted record with unexpected key {}", + key + ); + for &xid in xids { + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + // Check that we're modifying the correct CLOG block. 
+ assert!( + segno == expected_segno, + "ClogSetCommitted record for XID {} with unexpected key {}", + xid, + key + ); + assert!( + blknum == expected_blknum, + "ClogSetCommitted record for XID {} with unexpected key {}", + xid, + key + ); + + transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page); + } + + // Append the timestamp + if page.len() == BLCKSZ as usize + 8 { + page.truncate(BLCKSZ as usize); + } + if page.len() == BLCKSZ as usize { + page.extend_from_slice(×tamp.to_be_bytes()); + } else { + warn!( + "CLOG blk {} in seg {} has invalid size {}", + blknum, + segno, + page.len() + ); + } + } + NeonWalRecord::ClogSetAborted { xids } => { + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetAborted record with unexpected key {}", + key + ); + for &xid in xids { + let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + // Check that we're modifying the correct CLOG block. + assert!( + segno == expected_segno, + "ClogSetAborted record for XID {} with unexpected key {}", + xid, + key + ); + assert!( + blknum == expected_blknum, + "ClogSetAborted record for XID {} with unexpected key {}", + xid, + key + ); + + transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); + } + } + NeonWalRecord::MultixactOffsetCreate { mid, moff } => { + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::MultiXactOffsets, + "MultixactOffsetCreate record with unexpected key {}", + key + ); + // Compute the block and offset to modify. + // See RecordNewMultiXact in PostgreSQL sources. 
+ let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; + let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; + let offset = (entryno * 4) as usize; + + // Check that we're modifying the correct multixact-offsets block. + let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + assert!( + segno == expected_segno, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", + mid, + key + ); + assert!( + blknum == expected_blknum, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", + mid, + key + ); + + LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); + } + NeonWalRecord::MultixactMembersCreate { moff, members } => { + let (slru_kind, segno, blknum) = key.to_slru_block().context("invalid record")?; + assert_eq!( + slru_kind, + SlruKind::MultiXactMembers, + "MultixactMembersCreate record with unexpected key {}", + key + ); + for (i, member) in members.iter().enumerate() { + let offset = moff + i as u32; + + // Compute the block and offset to modify. + // See RecordNewMultiXact in PostgreSQL sources. + let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32; + let memberoff = mx_offset_to_member_offset(offset); + let flagsoff = mx_offset_to_flags_offset(offset); + let bshift = mx_offset_to_flags_bitshift(offset); + + // Check that we're modifying the correct multixact-members block. 
+ let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + assert!( + segno == expected_segno, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key + ); + assert!( + blknum == expected_blknum, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key + ); + + let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); + flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); + flagsval |= member.status << bshift; + LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval); + LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); + } + } + NeonWalRecord::AuxFile { file_path, content } => { + let mut dir = AuxFilesDirectory::des(page)?; + dir.upsert(file_path.clone(), content.clone()); + + page.clear(); + let mut writer = page.writer(); + dir.ser_into(&mut writer)?; + } + #[cfg(test)] + NeonWalRecord::Test { + append, + clear, + will_init, + } => { + if *will_init { + assert!(*clear, "init record must be clear to ensure correctness"); + } + if *clear { + page.clear(); + } + page.put_slice(append.as_bytes()); + } + } + Ok(()) +} + +#[cfg(test)] +mod test { + use bytes::Bytes; + use pageserver_api::key::AUX_FILES_KEY; + + use super::*; + use std::collections::HashMap; + + /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile + #[test] + fn apply_aux_file_deltas() -> anyhow::Result<()> { + let base_dir = AuxFilesDirectory { + files: HashMap::from([ + ("two".to_string(), Bytes::from_static(b"content0")), + ("three".to_string(), Bytes::from_static(b"contentX")), + ]), + }; + let base_image = AuxFilesDirectory::ser(&base_dir)?; + + let deltas = vec![ + // Insert + NeonWalRecord::AuxFile { + file_path: "one".to_string(), + content: Some(Bytes::from_static(b"content1")), + }, + // Update + NeonWalRecord::AuxFile { + file_path: "two".to_string(), + content: 
Some(Bytes::from_static(b"content99")), + }, + // Delete + NeonWalRecord::AuxFile { + file_path: "three".to_string(), + content: None, + }, + ]; + + let file_path = AUX_FILES_KEY; + let mut page = BytesMut::from_iter(base_image); + + for record in deltas { + apply_in_neon(&record, Lsn(8), file_path, &mut page)?; + } + + let reconstructed = AuxFilesDirectory::des(&page)?; + let expect = HashMap::from([ + ("one".to_string(), Bytes::from_static(b"content1")), + ("two".to_string(), Bytes::from_static(b"content99")), + ]); + + assert_eq!(reconstructed.files, expect); + + Ok(()) + } +} diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs new file mode 100644 index 0000000000..9140d4f6aa --- /dev/null +++ b/pageserver/src/walredo/process.rs @@ -0,0 +1,382 @@ +mod no_leak_child; +/// The IPC protocol that pageserver and walredo process speak over their shared pipe. +mod protocol; + +use self::no_leak_child::NoLeakChild; +use crate::{ + config::PageServerConf, + metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, + span::debug_assert_current_span_has_tenant_id, + walrecord::NeonWalRecord, +}; +use anyhow::Context; +use bytes::Bytes; +use pageserver_api::{reltag::RelTag, shard::TenantShardId}; +use postgres_ffi::BLCKSZ; +#[cfg(feature = "testing")] +use std::sync::atomic::AtomicUsize; +use std::{ + collections::VecDeque, + process::{Command, Stdio}, + time::Duration, +}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tracing::{debug, error, instrument, Instrument}; +use utils::{lsn::Lsn, poison::Poison}; + +pub struct WalRedoProcess { + #[allow(dead_code)] + conf: &'static PageServerConf, + #[cfg(feature = "testing")] + tenant_shard_id: TenantShardId, + // Some() on construction, only becomes None on Drop. + child: Option, + stdout: tokio::sync::Mutex>, + stdin: tokio::sync::Mutex>, + /// Counter to separate same sized walredo inputs failing at the same millisecond. 
+ #[cfg(feature = "testing")] + dump_sequence: AtomicUsize, +} + +struct ProcessInput { + stdin: tokio::process::ChildStdin, + n_requests: usize, +} + +struct ProcessOutput { + stdout: tokio::process::ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + +impl WalRedoProcess { + // + // Start postgres binary in special WAL redo mode. + // + #[instrument(skip_all,fields(pg_version=pg_version))] + pub(crate) fn launch( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + pg_version: u32, + ) -> anyhow::Result { + crate::span::debug_assert_current_span_has_tenant_id(); + + let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. + let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?; + + use no_leak_child::NoLeakChildCommandExt; + // Start postgres itself + let child = Command::new(pg_bin_dir_path.join("postgres")) + // the first arg must be --wal-redo so the child process enters into walredo mode + .arg("--wal-redo") + // the child doesn't process this arg, but, having it in the argv helps indentify the + // walredo process for a particular tenant when debugging a pagserver + .args(["--tenant-shard-id", &format!("{tenant_shard_id}")]) + .stdin(Stdio::piped()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .env_clear() + .env("LD_LIBRARY_PATH", &pg_lib_dir_path) + .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) + // NB: The redo process is not trusted after we sent it the first + // walredo work. Before that, it is trusted. Specifically, we trust + // it to + // 1. close all file descriptors except stdin, stdout, stderr because + // pageserver might not be 100% diligent in setting FD_CLOEXEC on all + // the files it opens, and + // 2. to use seccomp to sandbox itself before processing the first + // walredo request. 
+ .spawn_no_leak_child(tenant_shard_id) + .context("spawn process")?; + WAL_REDO_PROCESS_COUNTERS.started.inc(); + let mut child = scopeguard::guard(child, |child| { + error!("killing wal-redo-postgres process due to a problem during launch"); + child.kill_and_wait(WalRedoKillCause::Startup); + }); + + let stdin = child.stdin.take().unwrap(); + let stdout = child.stdout.take().unwrap(); + let stderr = child.stderr.take().unwrap(); + let stderr = tokio::process::ChildStderr::from_std(stderr) + .context("convert to tokio::ChildStderr")?; + let stdin = + tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?; + let stdout = tokio::process::ChildStdout::from_std(stdout) + .context("convert to tokio::ChildStdout")?; + + // all fallible operations post-spawn are complete, so get rid of the guard + let child = scopeguard::ScopeGuard::into_inner(child); + + tokio::spawn( + async move { + scopeguard::defer! { + debug!("wal-redo-postgres stderr_logger_task finished"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc(); + } + debug!("wal-redo-postgres stderr_logger_task started"); + crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc(); + + use tokio::io::AsyncBufReadExt; + let mut stderr_lines = tokio::io::BufReader::new(stderr); + let mut buf = Vec::new(); + let res = loop { + buf.clear(); + // TODO we don't trust the process to cap its stderr length. + // Currently it can do unbounded Vec allocation. 
+ match stderr_lines.read_until(b'\n', &mut buf).await { + Ok(0) => break Ok(()), // eof + Ok(num_bytes) => { + let output = String::from_utf8_lossy(&buf[..num_bytes]); + error!(%output, "received output"); + } + Err(e) => { + break Err(e); + } + } + }; + match res { + Ok(()) => (), + Err(e) => { + error!(error=?e, "failed to read from walredo stderr"); + } + } + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) + ); + + Ok(Self { + conf, + #[cfg(feature = "testing")] + tenant_shard_id, + child: Some(child), + stdin: tokio::sync::Mutex::new(Poison::new( + "stdin", + ProcessInput { + stdin, + n_requests: 0, + }, + )), + stdout: tokio::sync::Mutex::new(Poison::new( + "stdout", + ProcessOutput { + stdout, + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }, + )), + #[cfg(feature = "testing")] + dump_sequence: AtomicUsize::default(), + }) + } + + pub(crate) fn id(&self) -> u32 { + self.child + .as_ref() + .expect("must not call this during Drop") + .id() + } + + /// Apply given WAL records ('records') over an old page image. Returns + /// new page image. + /// + /// # Cancel-Safety + /// + /// Cancellation safe. + #[instrument(skip_all, fields(pid=%self.id()))] + pub(crate) async fn apply_wal_records( + &self, + rel: RelTag, + blknum: u32, + base_img: &Option, + records: &[(Lsn, NeonWalRecord)], + wal_redo_timeout: Duration, + ) -> anyhow::Result { + debug_assert_current_span_has_tenant_id(); + + let tag = protocol::BufferTag { rel, blknum }; + + // Serialize all the messages to send the WAL redo process first. + // + // This could be problematic if there are millions of records to replay, + // but in practice the number of records is usually so small that it doesn't + // matter, and it's better to keep this code simple. 
+ // + // Most requests start with a before-image with BLCKSZ bytes, followed by + // by some other WAL records. Start with a buffer that can hold that + // comfortably. + let mut writebuf: Vec = Vec::with_capacity((BLCKSZ as usize) * 3); + protocol::build_begin_redo_for_block_msg(tag, &mut writebuf); + if let Some(img) = base_img { + protocol::build_push_page_msg(tag, img, &mut writebuf); + } + for (lsn, rec) in records.iter() { + if let NeonWalRecord::Postgres { + will_init: _, + rec: postgres_rec, + } = rec + { + protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); + } else { + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); + } + } + protocol::build_get_page_msg(tag, &mut writebuf); + WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); + + let Ok(res) = + tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await + else { + anyhow::bail!("WAL redo timed out"); + }; + + if res.is_err() { + // not all of these can be caused by this particular input, however these are so rare + // in tests so capture all. + self.record_and_log(&writebuf); + } + + res + } + + /// # Cancel-Safety + /// + /// When not polled to completion (e.g. because in `tokio::select!` another + /// branch becomes ready before this future), concurrent and subsequent + /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls. + /// Dispose of this process instance and create a new one. + async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result { + let request_no = { + let mut lock_guard = self.stdin.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let input = poison_guard.data_mut(); + input + .stdin + .write_all(writebuf) + .await + .context("write to walredo stdin")?; + let request_no = input.n_requests; + input.n_requests += 1; + poison_guard.disarm(); + request_no + }; + + // To improve walredo performance we separate sending requests and receiving + // responses. 
Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut lock_guard = self.stdout.lock().await; + let mut poison_guard = lock_guard.check_and_arm()?; + let output = poison_guard.data_mut(); + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + output + .stdout + .read_exact(&mut resultbuf) + .await + .context("read walredo stdout")?; + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. 
+ // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + poison_guard.disarm(); + Ok(res) + } + + #[cfg(feature = "testing")] + fn record_and_log(&self, writebuf: &[u8]) { + use std::sync::atomic::Ordering; + + let millis = std::time::SystemTime::now() + .duration_since(std::time::SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis(); + + let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed); 
+ + // these files will be collected to an allure report + let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); + + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); + + use std::io::Write; + let res = std::fs::OpenOptions::new() + .write(true) + .create_new(true) + .read(true) + .open(path) + .and_then(|mut f| f.write_all(writebuf)); + + // trip up allowed_errors + if let Err(e) = res { + tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}"); + } else { + tracing::error!(filename, "erroring walredo input saved"); + } + } + + #[cfg(not(feature = "testing"))] + fn record_and_log(&self, _: &[u8]) {} +} + +impl Drop for WalRedoProcess { + fn drop(&mut self) { + self.child + .take() + .expect("we only do this once") + .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop); + // no way to wait for stderr_logger_task from Drop because that is async only + } +} diff --git a/pageserver/src/walredo/process/no_leak_child.rs b/pageserver/src/walredo/process/no_leak_child.rs new file mode 100644 index 0000000000..1a0d7039df --- /dev/null +++ b/pageserver/src/walredo/process/no_leak_child.rs @@ -0,0 +1,124 @@ +use tracing::instrument; +use tracing::{error, info}; + +use crate::metrics::WalRedoKillCause; +use crate::metrics::WAL_REDO_PROCESS_COUNTERS; + +use std::io; +use std::process::Command; + +use std::ops::DerefMut; + +use std::ops::Deref; + +use std::process::Child; + +use pageserver_api::shard::TenantShardId; + +/// Wrapper type around `std::process::Child` which guarantees that the child +/// will be killed and waited-for by this process before being dropped. 
+pub(crate) struct NoLeakChild { + pub(crate) tenant_id: TenantShardId, + pub(crate) child: Option, +} + +impl Deref for NoLeakChild { + type Target = Child; + + fn deref(&self) -> &Self::Target { + self.child.as_ref().expect("must not use from drop") + } +} + +impl DerefMut for NoLeakChild { + fn deref_mut(&mut self) -> &mut Self::Target { + self.child.as_mut().expect("must not use from drop") + } +} + +impl NoLeakChild { + pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { + let child = command.spawn()?; + Ok(NoLeakChild { + tenant_id, + child: Some(child), + }) + } + + pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + Self::kill_and_wait_impl(child, cause); + } + + #[instrument(skip_all, fields(pid=child.id(), ?cause))] + pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) { + scopeguard::defer! { + WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc(); + } + let res = child.kill(); + if let Err(e) = res { + // This branch is very unlikely because: + // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. + // - This is the only place that calls .kill() + // - We consume `self`, so, .kill() can't be called twice. + // - If the process exited by itself or was killed by someone else, + // .kill() will still succeed because we haven't wait()'ed yet. + // + // So, if we arrive here, we have really no idea what happened, + // whether the PID stored in self.child is still valid, etc. + // If this function were fallible, we'd return an error, but + // since it isn't, all we can do is log an error and proceed + // with the wait(). 
+ error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process"); + } + + match child.wait() { + Ok(exit_status) => { + info!(exit_status = %exit_status, "wait successful"); + } + Err(e) => { + error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)"); + } + } + } +} + +impl Drop for NoLeakChild { + fn drop(&mut self) { + let child = match self.child.take() { + Some(child) => child, + None => return, + }; + let tenant_shard_id = self.tenant_id; + // Offload the kill+wait of the child process into the background. + // If someone stops the runtime, we'll leak the child process. + // We can ignore that case because we only stop the runtime on pageserver exit. + tokio::runtime::Handle::current().spawn(async move { + tokio::task::spawn_blocking(move || { + // Intentionally don't inherit the tracing context from whoever is dropping us. + // This thread here is going to outlive of our dropper. + let span = tracing::info_span!( + "walredo", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + ); + let _entered = span.enter(); + Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); + }) + .await + }); + } +} + +pub(crate) trait NoLeakChildCommandExt { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; +} + +impl NoLeakChildCommandExt for Command { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { + NoLeakChild::spawn(tenant_id, self) + } +} diff --git a/pageserver/src/walredo/process/protocol.rs b/pageserver/src/walredo/process/protocol.rs new file mode 100644 index 0000000000..b703344cc8 --- /dev/null +++ b/pageserver/src/walredo/process/protocol.rs @@ -0,0 +1,57 @@ +use bytes::BufMut; +use pageserver_api::reltag::RelTag; +use serde::Serialize; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; + +/// +/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster. 
+/// +/// In Postgres `BufferTag` structure is used for exactly the same purpose. +/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). +/// +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] +pub(crate) struct BufferTag { + pub rel: RelTag, + pub blknum: u32, +} + +pub(crate) fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec) { + let len = 4 + 1 + 4 * 4; + + buf.put_u8(b'B'); + buf.put_u32(len as u32); + + tag.ser_into(buf) + .expect("serialize BufferTag should always succeed"); +} + +pub(crate) fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec) { + assert!(base_img.len() == 8192); + + let len = 4 + 1 + 4 * 4 + base_img.len(); + + buf.put_u8(b'P'); + buf.put_u32(len as u32); + tag.ser_into(buf) + .expect("serialize BufferTag should always succeed"); + buf.put(base_img); +} + +pub(crate) fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec) { + let len = 4 + 8 + rec.len(); + + buf.put_u8(b'A'); + buf.put_u32(len as u32); + buf.put_u64(endlsn.0); + buf.put(rec); +} + +pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec) { + let len = 4 + 1 + 4 * 4; + + buf.put_u8(b'G'); + buf.put_u32(len as u32); + tag.ser_into(buf) + .expect("serialize BufferTag should always succeed"); +} diff --git a/patches/pg_anon.patch b/patches/pg_anon.patch new file mode 100644 index 0000000000..15dfd3c5a0 --- /dev/null +++ b/patches/pg_anon.patch @@ -0,0 +1,223 @@ +commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f +Author: Alexey Masterov +Date: Fri May 31 06:34:26 2024 +0000 + + These alternative expected files were added to consider the neon features + +diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out +new file mode 100644 +index 0000000..2539cfd +--- /dev/null ++++ 
b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out +@@ -0,0 +1,101 @@ ++BEGIN; ++CREATE EXTENSION anon CASCADE; ++NOTICE: installing required extension "pgcrypto" ++SELECT anon.init(); ++ init ++------ ++ t ++(1 row) ++ ++CREATE ROLE mallory_the_masked_user; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; ++CREATE TABLE t1(i INT); ++ALTER TABLE t1 ADD COLUMN t TEXT; ++SECURITY LABEL FOR anon ON COLUMN t1.t ++IS 'MASKED WITH VALUE NULL'; ++INSERT INTO t1 VALUES (1,'test'); ++-- ++-- We're checking the owner's permissions ++-- ++-- see ++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions ++-- ++SET ROLE mallory_the_masked_user; ++SELECT anon.pseudo_first_name(0) IS NOT NULL; ++ ?column? ++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.init(); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.anonymize_table('t1'); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++SAVEPOINT fail_start_engine; ++SELECT anon.start_dynamic_masking(); ++ERROR: Only supersusers can start the dynamic masking engine. 
++CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE ++ROLLBACK TO fail_start_engine; ++RESET ROLE; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++SET ROLE mallory_the_masked_user; ++SELECT * FROM mask.t1; ++ i | t ++---+--- ++ 1 | ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ SELECT * FROM public.t1; ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++SAVEPOINT fail_stop_engine; ++SELECT anon.stop_dynamic_masking(); ++ERROR: Only supersusers can stop the dynamic masking engine. ++CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE ++ROLLBACK TO fail_stop_engine; ++RESET ROLE; ++SELECT anon.stop_dynamic_masking(); ++NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. ++ stop_dynamic_masking ++---------------------- ++ t ++(1 row) ++ ++SET ROLE mallory_the_masked_user; ++SELECT COUNT(*)=1 FROM anon.pg_masking_rules; ++ ?column? ++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++SAVEPOINT fail_seclabel_on_role; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; ++ERROR: permission denied ++DETAIL: The current user must have the CREATEROLE attribute. 
++ROLLBACK TO fail_seclabel_on_role; ++ROLLBACK; +diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out +new file mode 100644 +index 0000000..8b090fe +--- /dev/null ++++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out +@@ -0,0 +1,104 @@ ++BEGIN; ++CREATE EXTENSION anon CASCADE; ++NOTICE: installing required extension "pgcrypto" ++SELECT anon.init(); ++ init ++------ ++ t ++(1 row) ++ ++CREATE ROLE oscar_the_owner; ++ALTER DATABASE :DBNAME OWNER TO oscar_the_owner; ++CREATE ROLE mallory_the_masked_user; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; ++-- ++-- We're checking the owner's permissions ++-- ++-- see ++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions ++-- ++SET ROLE oscar_the_owner; ++SELECT anon.pseudo_first_name(0) IS NOT NULL; ++ ?column? ++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.init(); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++CREATE TABLE t1(i INT); ++ALTER TABLE t1 ADD COLUMN t TEXT; ++SECURITY LABEL FOR anon ON COLUMN t1.t ++IS 'MASKED WITH VALUE NULL'; ++INSERT INTO t1 VALUES (1,'test'); ++SELECT anon.anonymize_table('t1'); ++ anonymize_table ++----------------- ++ t ++(1 row) ++ ++SELECT * FROM t1; ++ i | t ++---+--- ++ 1 | ++(1 row) ++ ++UPDATE t1 SET t='test' WHERE i=1; ++-- SHOULD FAIL ++SAVEPOINT fail_start_engine; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++ROLLBACK TO fail_start_engine; ++RESET ROLE; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++SET ROLE oscar_the_owner; ++SELECT * FROM t1; ++ i | t ++---+------ ++ 1 | test ++(1 row) ++ ++--SELECT * FROM mask.t1; ++-- SHOULD FAIL ++SAVEPOINT fail_stop_engine; ++SELECT 
anon.stop_dynamic_masking(); ++ERROR: permission denied for schema mask ++CONTEXT: SQL statement "DROP VIEW mask.t1;" ++PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE ++SQL statement "SELECT anon.mask_drop_view(oid) ++ FROM pg_catalog.pg_class ++ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE ++ AND relkind IN ('r','p','f')" ++PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM ++ROLLBACK TO fail_stop_engine; ++RESET ROLE; ++SELECT anon.stop_dynamic_masking(); ++NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. ++ stop_dynamic_masking ++---------------------- ++ t ++(1 row) ++ ++SET ROLE oscar_the_owner; ++-- SHOULD FAIL ++SAVEPOINT fail_seclabel_on_role; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; ++ERROR: permission denied ++DETAIL: The current user must have the CREATEROLE attribute. ++ROLLBACK TO fail_seclabel_on_role; ++ROLLBACK; diff --git a/patches/pg_cron.patch b/patches/pg_cron.patch new file mode 100644 index 0000000000..c2b648c20c --- /dev/null +++ b/patches/pg_cron.patch @@ -0,0 +1,19 @@ +commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master) +Author: Alexey Masterov +Date: Fri Jun 7 19:23:42 2024 +0000 + + Disable REGRESS_OPTIONS causing initdb + +diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile +index 053314c..fbd5fb5 100644 +--- a/ext-src/pg_cron-src/Makefile ++++ b/ext-src/pg_cron-src/Makefile +@@ -5,7 +5,7 @@ EXTENSION = pg_cron + DATA_built = $(EXTENSION)--1.0.sql + DATA = $(wildcard $(EXTENSION)--*--*.sql) + +-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check ++#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check + REGRESS = pg_cron-test + + # compilation configuration diff --git a/patches/pg_hintplan.patch b/patches/pg_hintplan.patch new file mode 100644 index 0000000000..61a5ecbb90 --- /dev/null +++ 
b/patches/pg_hintplan.patch @@ -0,0 +1,39 @@ +commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master) +Author: Alexey Masterov +Date: Thu Jun 6 08:02:42 2024 +0000 + + Patch expected files to consider Neon's log messages + +diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out +index da723b8..f8d0102 100644 +--- a/ext-src/pg_hint_plan-src/expected/ut-A.out ++++ b/ext-src/pg_hint_plan-src/expected/ut-A.out +@@ -9,13 +9,16 @@ SET search_path TO public; + ---- + -- No.A-1-1-3 + CREATE EXTENSION pg_hint_plan; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + -- No.A-1-2-3 + DROP EXTENSION pg_hint_plan; + -- No.A-1-1-4 + CREATE SCHEMA other_schema; + CREATE EXTENSION pg_hint_plan SCHEMA other_schema; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" + CREATE EXTENSION pg_hint_plan; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + DROP SCHEMA other_schema; + ---- + ---- No. 
A-5-1 comment pattern +diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +index d372459..6282afe 100644 +--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out ++++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; + SET client_min_messages TO LOG; + SET pg_hint_plan.enable_hint TO on; + CREATE EXTENSION file_fdw; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw + CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; + CREATE USER MAPPING FOR PUBLIC SERVER file_server; + CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/patches/pgvector.patch b/patches/pgvector.patch new file mode 100644 index 0000000000..3e1ffcaaaf --- /dev/null +++ b/patches/pgvector.patch @@ -0,0 +1,62 @@ +diff --git a/src/hnswbuild.c b/src/hnswbuild.c +index dcfb2bd..d5189ee 100644 +--- a/src/hnswbuild.c ++++ b/src/hnswbuild.c +@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) + + hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(indexRel)); ++#endif ++ + /* Perform inserts */ + HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); ++#endif ++ + /* Close relations within worker */ + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + SeedRandom(42); + #endif + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + InitBuildState(buildstate, heap, index, indexInfo, forkNum); + + BuildGraph(buildstate, forkNum); + +- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) ++#ifdef NEON_SMGR ++ 
smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); ++#endif ++ ++ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true); ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ } ++#endif ++ } ++ ++#ifdef NEON_SMGR ++ smgr_end_unlogged_build(RelationGetSmgr(index)); ++#endif + + FreeBuildState(buildstate); + } diff --git a/pgxn/.dir-locals.el b/pgxn/.dir-locals.el new file mode 100644 index 0000000000..ab6208b698 --- /dev/null +++ b/pgxn/.dir-locals.el @@ -0,0 +1,19 @@ +;; see also src/tools/editors/emacs.samples for more complete settings + +((c-mode . ((c-basic-offset . 4) + (c-file-style . "bsd") + (fill-column . 78) + (indent-tabs-mode . t) + (tab-width . 4))) + (nxml-mode . ((fill-column . 78) + (indent-tabs-mode . nil))) + (perl-mode . ((perl-indent-level . 4) + (perl-continued-statement-offset . 2) + (perl-continued-brace-offset . -2) + (perl-brace-offset . 0) + (perl-brace-imaginary-offset . 0) + (perl-label-offset . -2) + (indent-tabs-mode . t) + (tab-width . 4))) + (sgml-mode . ((fill-column . 78) + (indent-tabs-mode . 
nil)))) diff --git a/pgxn/.editorconfig b/pgxn/.editorconfig new file mode 100644 index 0000000000..d69a3d1dc4 --- /dev/null +++ b/pgxn/.editorconfig @@ -0,0 +1,14 @@ +root = true + +[*.{c,h,l,y,pl,pm}] +indent_style = tab +indent_size = tab +tab_width = 4 + +[*.{sgml,xml}] +indent_style = space +indent_size = 1 + +[*.xsl] +indent_style = space +indent_size = 2 diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c index 45bf78ed3b..e624cb831f 100644 --- a/pgxn/hnsw/hnsw.c +++ b/pgxn/hnsw/hnsw.c @@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested) struct sysinfo si; Size total; if (sysinfo(&si) < 0) - elog(ERROR, "Failed to get amount of RAM: %n"); + elog(ERROR, "Failed to get amount of RAM: %m"); total = si.totalram*si.mem_unit; if ((Size)NBuffers*BLCKSZ + requested >= total) diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index c6b224a14d..cd316dbb91 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -14,14 +14,15 @@ OBJS = \ relsize_cache.o \ walproposer.o \ walproposer_pg.o \ - control_plane_connector.o + control_plane_connector.o \ + walsender_hooks.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index e467a9c43a..93252e6b29 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -35,16 +35,17 @@ #include "utils/memutils.h" #include "utils/jsonb.h" +#include "control_plane_connector.h" +#include "neon_utils.h" + static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; +static const char *jwt_token = NULL; + /* GUCs */ static char *ConsoleURL = NULL; static bool ForwardDDL = true; -/* Curl structures for sending the 
HTTP requests */ -static CURL *CurlHandle; -static struct curl_slist *ContentHeader = NULL; - /* * CURL docs say that this buffer must exist until we call curl_easy_cleanup * (which we never do), so we make this a static @@ -113,6 +114,8 @@ ConstructDeltaMessage() if (RootTable.db_table) { JsonbValue dbs; + HASH_SEQ_STATUS status; + DbEntry *entry; dbs.type = jbvString; dbs.val.string.val = "dbs"; @@ -120,9 +123,6 @@ ConstructDeltaMessage() pushJsonbValue(&state, WJB_KEY, &dbs); pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); - HASH_SEQ_STATUS status; - DbEntry *entry; - hash_seq_init(&status, RootTable.db_table); while ((entry = hash_seq_search(&status)) != NULL) { @@ -168,8 +168,9 @@ ConstructDeltaMessage() #else const char *logdetail; #endif + char *encrypted_password; PushKeyValue(&state, "password", (char *) entry->password); - char *encrypted_password = get_role_password(entry->name, &logdetail); + encrypted_password = get_role_password(entry->name, &logdetail); if (encrypted_password) { @@ -226,6 +227,8 @@ ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata) static void SendDeltasToControlPlane() { + static CURL *handle = NULL; + if (!RootTable.db_table && !RootTable.role_table) return; if (!ConsoleURL) @@ -236,29 +239,57 @@ SendDeltasToControlPlane() if (!ForwardDDL) return; - char *message = ConstructDeltaMessage(); - ErrorString str = {}; + if (handle == NULL) + { + struct curl_slist *headers = NULL; - curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH"); - curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader); - curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message); - curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL); - curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf); - curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ ); - curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str); - curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + headers = 
curl_slist_append(headers, "Content-Type: application/json"); + if (headers == NULL) + { + elog(ERROR, "Failed to set Content-Type header"); + } + + if (jwt_token) + { + char auth_header[8192]; + + snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); + headers = curl_slist_append(headers, auth_header); + if (headers == NULL) + { + elog(ERROR, "Failed to set Authorization header"); + } + } + + handle = alloc_curl_handle(); + + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "PATCH"); + curl_easy_setopt(handle, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(handle, CURLOPT_URL, ConsoleURL); + curl_easy_setopt(handle, CURLOPT_ERRORBUFFER, CurlErrorBuf); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + } + + char *message = ConstructDeltaMessage(); + ErrorString str; + + str.size = 0; + + curl_easy_setopt(handle, CURLOPT_POSTFIELDS, message); + curl_easy_setopt(handle, CURLOPT_WRITEDATA, &str); const int num_retries = 5; - int curl_status; + CURLcode curl_status; for (int i = 0; i < num_retries; i++) { - if ((curl_status = curl_easy_perform(CurlHandle)) == 0) + if ((curl_status = curl_easy_perform(handle)) == 0) break; elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf); pg_usleep(1000 * 1000); } - if (curl_status != 0) + if (curl_status != CURLE_OK) { elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf); } @@ -266,13 +297,11 @@ SendDeltasToControlPlane() { long response_code; - if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) + if (curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) { - bool error_exists = str.size != 0; - if (response_code != 200) { - if (error_exists) + if (str.size != 0) { elog(ERROR, "Received HTTP code %ld from control plane: %s", @@ -637,7 +666,7 @@ HandleAlterRole(AlterRoleStmt *stmt) ListCell *option; const char 
*role_name = stmt->role->rolename; - if (RoleIsNeonSuperuser(role_name)) + if (RoleIsNeonSuperuser(role_name) && !superuser()) elog(ERROR, "can't ALTER neon_superuser"); foreach(option, stmt->options) @@ -803,7 +832,7 @@ NeonProcessUtility( } } -extern void +void InitControlPlaneConnector() { PreviousProcessUtilityHook = ProcessUtility_hook; @@ -835,34 +864,10 @@ InitControlPlaneConnector() NULL, NULL); - const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); - + jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); if (!jwt_token) { elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated"); } - if (curl_global_init(CURL_GLOBAL_DEFAULT)) - { - elog(ERROR, "Failed to initialize curl"); - } - if ((CurlHandle = curl_easy_init()) == NULL) - { - elog(ERROR, "Failed to initialize curl handle"); - } - if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL) - { - elog(ERROR, "Failed to initialize content header"); - } - - if (jwt_token) - { - char auth_header[8192]; - - snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); - if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL) - { - elog(ERROR, "Failed to initialize authorization header"); - } - } } diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h index 12d6a97562..7eed449200 100644 --- a/pgxn/neon/control_plane_connector.h +++ b/pgxn/neon/control_plane_connector.h @@ -1,6 +1,6 @@ #ifndef CONTROL_PLANE_CONNECTOR_H #define CONTROL_PLANE_CONNECTOR_H -void InitControlPlaneConnector(); +void InitControlPlaneConnector(void); #endif diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c index d9a75142f1..e38af08f89 100644 --- a/pgxn/neon/extension_server.c +++ b/pgxn/neon/extension_server.c @@ -14,6 +14,9 @@ #include "utils/guc.h" +#include "extension_server.h" +#include "neon_utils.h" + static int extension_server_port = 0; static 
download_extension_file_hook_type prev_download_extension_file_hook = NULL; @@ -31,15 +34,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL static bool neon_download_extension_file_http(const char *filename, bool is_library) { - CURL *curl; + static CURL *handle = NULL; + CURLcode res; char *compute_ctl_url; - char *postdata; bool ret = false; - if ((curl = curl_easy_init()) == NULL) + if (handle == NULL) { - elog(ERROR, "Failed to initialize curl handle"); + handle = alloc_curl_handle(); + + curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST"); + curl_easy_setopt(handle, CURLOPT_TIMEOUT, 3L /* seconds */ ); } compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", @@ -47,28 +53,22 @@ neon_download_extension_file_http(const char *filename, bool is_library) elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST"); - curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(handle, CURLOPT_URL, compute_ctl_url); - if (curl) + /* Perform the request, res will get the return code */ + res = curl_easy_perform(handle); + /* Check for errors */ + if (res == CURLE_OK) { - /* Perform the request, res will get the return code */ - res = curl_easy_perform(curl); - /* Check for errors */ - if (res == CURLE_OK) - { - ret = true; - } - else - { - /* Don't error here because postgres will try to find the file */ - /* and will fail with some proper error message if it's not found. */ - elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); - } - - /* always cleanup */ - curl_easy_cleanup(curl); + ret = true; + } + else + { + /* + * Don't error here because postgres will try to find the file and will + * fail with some proper error message if it's not found. 
+ */ + elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); } return ret; diff --git a/pgxn/neon/extension_server.h b/pgxn/neon/extension_server.h new file mode 100644 index 0000000000..3e67708b85 --- /dev/null +++ b/pgxn/neon/extension_server.h @@ -0,0 +1,17 @@ +/*------------------------------------------------------------------------- + * + * extension_server.h + * Request compute_ctl to download extension files. + * + * IDENTIFICATION + * contrib/neon/extension_server.h + * + *------------------------------------------------------------------------- + */ + +#ifndef EXTENSION_SERVER_H +#define EXTENSION_SERVER_H + +void pg_init_extension_server(void); + +#endif /* EXTENSION_SERVER_H */ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 6725ce8fff..25275ef31f 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -25,6 +25,8 @@ #include "funcapi.h" #include "miscadmin.h" #include "pagestore_client.h" +#include "common/hashfn.h" +#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -60,6 +62,7 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) +#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -84,6 +87,8 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ + hyperLogLogState wss_estimation; /* estimation of wroking set size */ + uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; } FileCacheControl; static HTAB *lfc_hash; @@ -232,6 +237,14 @@ lfc_shmem_startup(void) lfc_ctl->writes = 0; dlist_init(&lfc_ctl->lru); + /* Initialize hyper-log-log structure for estimating working set size */ + initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); + + /* We need hashes in shared memory */ + 
pfree(lfc_ctl->wss_estimation.hashesArr); + memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); if (fd < 0) @@ -308,13 +321,16 @@ lfc_change_limit_hook(int newval, void *extra) Assert(victim->access_count == 0); #ifdef FALLOC_FL_PUNCH_HOLE if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0) - elog(LOG, "Failed to punch hole in file: %m"); + neon_log(LOG, "Failed to punch hole in file: %m"); #endif hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); lfc_ctl->used -= 1; } lfc_ctl->limit = new_size; - elog(DEBUG1, "set local file cache limit to %d", new_size); + if (new_size == 0) { + lfc_ctl->generation += 1; + } + neon_log(DEBUG1, "set local file cache limit to %d", new_size); LWLockRelease(lfc_lock); } @@ -327,7 +343,7 @@ lfc_init(void) * shared_preload_libraries. 
*/ if (!process_shared_preload_libraries_in_progress) - elog(ERROR, "Neon module should be loaded via shared_preload_libraries"); + neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries"); DefineCustomIntVariable("neon.max_file_cache_size", @@ -526,10 +542,16 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, } entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + + /* Approximate working set */ + tag.blockNum = blkno; + addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { /* Page is not cached */ lfc_ctl->misses += 1; + pgBufferUsage.file_cache.misses += 1; LWLockRelease(lfc_lock); return false; } @@ -555,6 +577,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { Assert(LFC_ENABLED()); lfc_ctl->hits += 1; + pgBufferUsage.file_cache.hits += 1; Assert(entry->access_count > 0); if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->lru_node); @@ -643,7 +666,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void Assert(victim->access_count == 0); entry->offset = victim->offset; /* grab victim's chunk */ hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - elog(DEBUG2, "Swap file cache page"); + neon_log(DEBUG2, "Swap file cache page"); } else { @@ -846,10 +869,10 @@ local_cache_pages(PG_FUNCTION_ARGS) * wrong) function definition though. */ if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); + neon_log(ERROR, "return type must be a row type"); if (expected_tupledesc->natts != NUM_LOCALCACHE_PAGES_ELEM) - elog(ERROR, "incorrect number of output arguments"); + neon_log(ERROR, "incorrect number of output arguments"); /* Construct a tuple descriptor for the result rows. 
*/ tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); @@ -962,3 +985,21 @@ local_cache_pages(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(funcctx); } + +PG_FUNCTION_INFO_V1(approximate_working_set_size); + +Datum +approximate_working_set_size(PG_FUNCTION_ARGS) +{ + int32 dc = -1; + if (lfc_size_limit != 0) + { + bool reset = PG_GETARG_BOOL(0); + LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); + dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + if (reset) + memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + LWLockRelease(lfc_lock); + } + PG_RETURN_INT32(dc); +} diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3a7c0f1bb6..a665cafafe 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/xlog.h" +#include "common/hashfn.h" #include "fmgr.h" #include "libpq-fe.h" #include "libpq/libpq.h" @@ -38,17 +39,6 @@ #define MIN_RECONNECT_INTERVAL_USEC 1000 #define MAX_RECONNECT_INTERVAL_USEC 1000000 -bool connected = false; -PGconn *pageserver_conn = NULL; - -/* - * WaitEventSet containing: - * - WL_SOCKET_READABLE on pageserver_conn, - * - WL_LATCH_SET on MyLatch, and - * - WL_EXIT_ON_PM_DEATH. - */ -WaitEventSet *pageserver_conn_wes = NULL; - /* GUCs */ char *neon_timeline; char *neon_tenant; @@ -59,16 +49,41 @@ char *neon_auth_token; int readahead_buffer_size = 128; int flush_every_n_requests = 8; -static int n_reconnect_attempts = 0; -static int max_reconnect_attempts = 60; +int neon_protocol_version = 2; -#define MAX_PAGESERVER_CONNSTRING_SIZE 256 +static int max_reconnect_attempts = 60; +static int stripe_size; typedef struct { - LWLockId lock; - pg_atomic_uint64 update_counter; - char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; + char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE]; + size_t num_shards; +} ShardMap; + +/* + * PagestoreShmemState is kept in shared memory. 
It contains the connection + * strings for each shard. + * + * The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option, + * allowing it to be changed using pg_reload_conf(). The control plane can + * update the connection string if the pageserver crashes, is relocated, or + * new shards are added. A parsed copy of the current value of the GUC is kept + * in shared memory, updated by the postmaster, because regular backends don't + * reload the config during query execution, but we might need to re-establish + * the pageserver connection with the new connection string even in the middle + * of a query. + * + * The shared memory copy is protected by a lockless algorithm using two + * atomic counters. The counters allow a backend to quickly check if the value + * has changed since last access, and to detect and retry copying the value if + * the postmaster changes the value concurrently. (Postmaster doesn't have a + * PGPROC entry and therefore cannot use LWLocks.) + */ +typedef struct +{ + pg_atomic_uint64 begin_update_counter; + pg_atomic_uint64 end_update_counter; + ShardMap shard_map; } PagestoreShmemState; #if PG_VERSION_NUM >= 150000 @@ -78,188 +93,562 @@ static void walproposer_shmem_request(void); static shmem_startup_hook_type prev_shmem_startup_hook; static PagestoreShmemState *pagestore_shared; static uint64 pagestore_local_counter = 0; -static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE]; -static bool pageserver_flush(void); -static void pageserver_disconnect(void); +typedef enum PSConnectionState { + PS_Disconnected, /* no connection yet */ + PS_Connecting_Startup, /* connection starting up */ + PS_Connecting_PageStream, /* negotiating pagestream */ + PS_Connected, /* connected, pagestream established */ +} PSConnectionState; + +/* This backend's per-shard connections */ +typedef struct +{ + TimestampTz last_connect_time; /* read-only debug value */ + TimestampTz last_reconnect_time; + uint32 delay_us; + int 
n_reconnect_attempts; + + /*--- + * Pageserver connection state, i.e. + * disconnected: conn == NULL, wes == NULL; + * conn_startup: connection initiated, waiting for connection establishing + * conn_ps: PageStream query sent, waiting for confirmation + * connected: PageStream established + */ + PSConnectionState state; + PGconn *conn; + /*--- + * WaitEventSet containing: + * - WL_SOCKET_READABLE on 'conn' + * - WL_LATCH_SET on MyLatch, and + * - WL_EXIT_ON_PM_DEATH. + */ + WaitEventSet *wes_read; +} PageServer; + +static PageServer page_servers[MAX_SHARDS]; + +static bool pageserver_flush(shardno_t shard_no); +static void pageserver_disconnect(shardno_t shard_no); +static void pageserver_disconnect_shard(shardno_t shard_no); static bool -PagestoreShmemIsValid() +PagestoreShmemIsValid(void) { return pagestore_shared && UsedShmemSegAddr; } +/* + * Parse a comma-separated list of connection strings into a ShardMap. + * + * If 'result' is NULL, just checks that the input is valid. If the input is + * not valid, returns false. The contents of *result are undefined in + * that case, and must not be relied on. + */ +static bool +ParseShardMap(const char *connstr, ShardMap *result) +{ + const char *p; + int nshards = 0; + + if (result) + memset(result, 0, sizeof(ShardMap)); + + p = connstr; + nshards = 0; + for (;;) + { + const char *sep; + size_t connstr_len; + + sep = strchr(p, ','); + connstr_len = sep != NULL ? 
sep - p : strlen(p); + + if (connstr_len == 0 && sep == NULL) + break; /* ignore trailing comma */ + + if (nshards >= MAX_SHARDS) + { + neon_log(LOG, "Too many shards"); + return false; + } + if (connstr_len >= MAX_PAGESERVER_CONNSTRING_SIZE) + { + neon_log(LOG, "Connection string too long"); + return false; + } + if (result) + { + memcpy(result->connstring[nshards], p, connstr_len); + result->connstring[nshards][connstr_len] = '\0'; + } + nshards++; + + if (sep == NULL) + break; + p = sep + 1; + } + if (result) + result->num_shards = nshards; + + return true; +} + static bool CheckPageserverConnstring(char **newval, void **extra, GucSource source) { - return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE; + char *p = *newval; + + return ParseShardMap(p, NULL); } static void AssignPageserverConnstring(const char *newval, void *extra) { - if (!PagestoreShmemIsValid()) + ShardMap shard_map; + + /* + * Only postmaster updates the copy in shared memory. + */ + if (!PagestoreShmemIsValid() || IsUnderPostmaster) return; - LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE); - strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE); - pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1); - LWLockRelease(pagestore_shared->lock); -} -static bool -CheckConnstringUpdated() -{ - if (!PagestoreShmemIsValid()) - return false; - return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter); -} - -static void -ReloadConnstring() -{ - if (!PagestoreShmemIsValid()) - return; - LWLockAcquire(pagestore_shared->lock, LW_SHARED); - strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring)); - pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter); - LWLockRelease(pagestore_shared->lock); -} - -static bool -pageserver_connect(int elevel) -{ - char *query; - int ret; - const char *keywords[3]; - const char *values[3]; - int n; - - static 
TimestampTz last_connect_time = 0; - static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; - TimestampTz now; - uint64_t us_since_last_connect; - - Assert(!connected); - - if (CheckConnstringUpdated()) + if (!ParseShardMap(newval, &shard_map)) { - ReloadConnstring(); + /* + * shouldn't happen, because we already checked the value in + * CheckPageserverConnstring + */ + elog(ERROR, "could not parse shard map"); } - now = GetCurrentTimestamp(); - us_since_last_connect = now - last_connect_time; - if (us_since_last_connect < delay_us) + if (memcmp(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)) != 0) { - pg_usleep(delay_us - us_since_last_connect); - delay_us *= 2; - if (delay_us > MAX_RECONNECT_INTERVAL_USEC) - delay_us = MAX_RECONNECT_INTERVAL_USEC; - last_connect_time = GetCurrentTimestamp(); + pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1); + pg_write_barrier(); + memcpy(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)); + pg_write_barrier(); + pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1); } else { - delay_us = MIN_RECONNECT_INTERVAL_USEC; - last_connect_time = now; + /* no change */ } +} + +/* + * Get the current number of shards, and/or the connection string for a + * particular shard from the shard map in shared memory. + * + * If num_shards_p is not NULL, it is set to the current number of shards. + * + * If connstr_p is not NULL, the connection string for 'shard_no' is copied to + * it. It must point to a buffer at least MAX_PAGESERVER_CONNSTRING_SIZE bytes + * long. + * + * As a side-effect, if the shard map in shared memory had changed since the + * last call, terminates all existing connections to all pageservers. 
+ */ +static void +load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p) +{ + uint64 begin_update_counter; + uint64 end_update_counter; + ShardMap *shard_map = &pagestore_shared->shard_map; + shardno_t num_shards; /* - * Connect using the connection string we got from the - * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment - * variable was set, use that as the password. - * - * The connection options are parsed in the order they're given, so when - * we set the password before the connection string, the connection string - * can override the password from the env variable. Seems useful, although - * we don't currently use that capability anywhere. + * Postmaster can update the shared memory values concurrently, in which + * case we would copy a garbled mix of the old and new values. We will + * detect it because the counter's won't match, and retry. But it's + * important that we don't do anything within the retry-loop that would + * depend on the string having valid contents. 
*/ - n = 0; - if (neon_auth_token) + do { - keywords[n] = "password"; - values[n] = neon_auth_token; - n++; + begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter); + end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter); + + num_shards = shard_map->num_shards; + if (connstr_p && shard_no < MAX_SHARDS) + strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE); + pg_memory_barrier(); } - keywords[n] = "dbname"; - values[n] = local_pageserver_connstring; - n++; - keywords[n] = NULL; - values[n] = NULL; - n++; - pageserver_conn = PQconnectdbParams(keywords, values, 1); + while (begin_update_counter != end_update_counter + || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter) + || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter)); - if (PQstatus(pageserver_conn) == CONNECTION_BAD) + if (connstr_p && shard_no >= num_shards) + neon_log(ERROR, "Shard %d is greater or equal than number of shards %d", + shard_no, num_shards); + + /* + * If any of the connection strings changed, reset all connections. 
+ */ + if (pagestore_local_counter != end_update_counter) { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); - - PQfinish(pageserver_conn); - pageserver_conn = NULL; - - ereport(elevel, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg(NEON_TAG "could not establish connection to pageserver"), - errdetail_internal("%s", msg))); - return false; - } - - query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); - ret = PQsendQuery(pageserver_conn, query); - if (ret != 1) - { - PQfinish(pageserver_conn); - pageserver_conn = NULL; - neon_log(elevel, "could not send pagestream command to pageserver"); - return false; - } - - pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3); - AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET, - MyLatch, NULL); - AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, - NULL, NULL); - AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL); - - while (PQisBusy(pageserver_conn)) - { - WaitEvent event; - - /* Sleep until there's something to do */ - (void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - - CHECK_FOR_INTERRUPTS(); - - /* Data available in socket? 
*/ - if (event.events & WL_SOCKET_READABLE) + for (shardno_t i = 0; i < MAX_SHARDS; i++) { - if (!PQconsumeInput(pageserver_conn)) + if (page_servers[i].conn) + pageserver_disconnect(i); + } + pagestore_local_counter = end_update_counter; + } + + if (num_shards_p) + *num_shards_p = num_shards; +} + +#define MB (1024*1024) + +shardno_t +get_shard_number(BufferTag *tag) +{ + shardno_t n_shards; + uint32 hash; + + load_shard_map(0, NULL, &n_shards); + +#if PG_MAJORVERSION_NUM < 16 + hash = murmurhash32(tag->rnode.relNode); + hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size)); +#else + hash = murmurhash32(tag->relNumber); + hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size)); +#endif + + return hash % n_shards; +} + +static inline void +CLEANUP_AND_DISCONNECT(PageServer *shard) +{ + if (shard->wes_read) + { + FreeWaitEventSet(shard->wes_read); + shard->wes_read = NULL; + } + if (shard->conn) + { + PQfinish(shard->conn); + shard->conn = NULL; + } + + shard->state = PS_Disconnected; +} + +/* + * Connect to a pageserver, or continue to try to connect if we're yet to + * complete the connection (e.g. due to receiving an earlier cancellation + * during connection start). + * Returns true if successfully connected; false if the connection failed. + * + * Throws errors in unrecoverable situations, or when this backend's query + * is canceled. + */ +static bool +pageserver_connect(shardno_t shard_no, int elevel) +{ + PageServer *shard = &page_servers[shard_no]; + char connstr[MAX_PAGESERVER_CONNSTRING_SIZE]; + + /* + * Get the connection string for this shard. If the shard map has been + * updated since we last looked, this will also disconnect any existing + * pageserver connections as a side effect. + * Note that connstr is used both during connection start, and when we + * log the successful connection. 
+ */ + load_shard_map(shard_no, connstr, NULL); + + switch (shard->state) + { + case PS_Disconnected: + { + const char *keywords[3]; + const char *values[3]; + int n_pgsql_params; + TimestampTz now; + int64 us_since_last_attempt; + + /* Make sure we start with a clean slate */ + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, DEBUG5, "Connection state: Disconnected"); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); + shard->last_reconnect_time = now; + + /* + * Make sure we don't do exponential backoff with a constant multiplier + * of 0 us, as that doesn't really do much for timeouts... + * + * cf. https://github.com/neondatabase/neon/issues/7897 + */ + if (shard->delay_us == 0) + shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; + + /* + * If we did other tasks between reconnect attempts, then we won't + * need to wait as long as a full delay. + */ + if (us_since_last_attempt < shard->delay_us) + { + pg_usleep(shard->delay_us - us_since_last_attempt); + } + + /* update the delay metric */ + shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + + /* + * Connect using the connection string we got from the + * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment + * variable was set, use that as the password. + * + * The connection options are parsed in the order they're given, so when + * we set the password before the connection string, the connection string + * can override the password from the env variable. Seems useful, although + * we don't currently use that capability anywhere. 
+ */ + keywords[0] = "dbname"; + values[0] = connstr; + n_pgsql_params = 1; + + if (neon_auth_token) + { + keywords[1] = "password"; + values[1] = neon_auth_token; + n_pgsql_params++; + } + + keywords[n_pgsql_params] = NULL; + values[n_pgsql_params] = NULL; + + shard->conn = PQconnectStartParams(keywords, values, 1); + if (!shard->conn) + { + neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); + return false; + } + + shard->state = PS_Connecting_Startup; + /* fallthrough */ + } + case PS_Connecting_Startup: + { + char *pagestream_query; + int ps_send_query_ret; + bool connected = false; + int poll_result = PGRES_POLLING_WRITING; + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_Startup"); + + do + { + WaitEvent event; + + switch (poll_result) { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); + default: /* unknown/unused states are handled as a failed connection */ + case PGRES_POLLING_FAILED: + { + char *pqerr = PQerrorMessage(shard->conn); + char *msg = NULL; + neon_shard_log(shard_no, DEBUG5, "POLLING_FAILED"); - PQfinish(pageserver_conn); - pageserver_conn = NULL; - FreeWaitEventSet(pageserver_conn_wes); - pageserver_conn_wes = NULL; + if (pqerr) + msg = pchomp(pqerr); - neon_log(elevel, "could not complete handshake with pageserver: %s", - msg); - return false; + CLEANUP_AND_DISCONNECT(shard); + + if (msg) + { + neon_shard_log(shard_no, elevel, + "could not connect to pageserver: %s", + msg); + pfree(msg); + } + else + neon_shard_log(shard_no, elevel, + "could not connect to pageserver"); + + return false; + } + case PGRES_POLLING_READING: + /* Sleep until there's something to do */ + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + 
CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_READABLE) + break; + } + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_WRITING: + /* Sleep until there's something to do */ + while (true) + { + int rc = WaitLatchOrSocket(MyLatch, + WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE, + PQsocket(shard->conn), + 0, + PG_WAIT_EXTENSION); + elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc); + if (rc & WL_LATCH_SET) + { + ResetLatch(MyLatch); + /* query cancellation, backend shutdown */ + CHECK_FOR_INTERRUPTS(); + } + if (rc & WL_SOCKET_WRITEABLE) + break; + } + /* PQconnectPoll() handles the socket polling state updates */ + + break; + case PGRES_POLLING_OK: + neon_shard_log(shard_no, DEBUG5, "POLLING_OK"); + connected = true; + break; + } + poll_result = PQconnectPoll(shard->conn); + elog(DEBUG5, "PQconnectPoll=>%d", poll_result); + } + while (!connected); + + /* No more polling needed; connection succeeded */ + shard->last_connect_time = GetCurrentTimestamp(); + + shard->wes_read = CreateWaitEventSet(TopMemoryContext, 3); + AddWaitEventToSet(shard->wes_read, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + AddWaitEventToSet(shard->wes_read, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + AddWaitEventToSet(shard->wes_read, WL_SOCKET_READABLE, PQsocket(shard->conn), NULL, NULL); + + + switch (neon_protocol_version) + { + case 2: + pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline); + break; + case 1: + pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline); + break; + default: + elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version); + } + + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + + CLEANUP_AND_DISCONNECT(shard); + + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", 
shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + + ps_send_query_ret = PQsendQuery(shard->conn, pagestream_query); + pfree(pagestream_query); + if (ps_send_query_ret != 1) + { + CLEANUP_AND_DISCONNECT(shard); + + neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver"); + return false; + } + + shard->state = PS_Connecting_PageStream; + /* fallthrough */ + } + case PS_Connecting_PageStream: + { + neon_shard_log(shard_no, DEBUG5, "Connection state: Connecting_PageStream"); + + if (PQstatus(shard->conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); + return false; + } + + while (PQisBusy(shard->conn)) + { + WaitEvent event; + + /* Sleep until there's something to do */ + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + + CHECK_FOR_INTERRUPTS(); + + /* Data available in socket? */ + if (event.events & WL_SOCKET_READABLE) + { + if (!PQconsumeInput(shard->conn)) + { + char *msg = pchomp(PQerrorMessage(shard->conn)); + + CLEANUP_AND_DISCONNECT(shard); + neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s", + msg); + pfree(msg); + return false; + } } } + + shard->state = PS_Connected; + /* fallthrough */ } + case PS_Connected: + /* + * We successfully connected. Future connections to this PageServer + * will do fast retries again, with exponential backoff. 
+ */ + shard->delay_us = MIN_RECONNECT_INTERVAL_USEC; - neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring); - - connected = true; - return true; + neon_shard_log(shard_no, DEBUG5, "Connection state: Connected"); + neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s' with protocol version %d", connstr, neon_protocol_version); + return true; + default: + neon_shard_log(shard_no, ERROR, "libpagestore: invalid connection state %d", shard->state); + } + /* This shouldn't be hit */ + Assert(false); } /* * A wrapper around PQgetCopyData that checks for interrupts while sleeping. */ static int -call_PQgetCopyData(char **buffer) +call_PQgetCopyData(shardno_t shard_no, char **buffer) { int ret; + PGconn *pageserver_conn = page_servers[shard_no].conn; retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); @@ -269,7 +658,7 @@ retry: WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -281,7 +670,7 @@ retry: { char *msg = pchomp(PQerrorMessage(pageserver_conn)); - neon_log(LOG, "could not get response from pageserver: %s", msg); + neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg); pfree(msg); return -1; } @@ -293,49 +682,61 @@ retry: return ret; } - +/* + * Reset prefetch and drop connection to the shard. + * It also drops connection to all other shards involved in prefetch, through + * prefetch_on_ps_disconnect(). + */ static void -pageserver_disconnect(void) +pageserver_disconnect(shardno_t shard_no) { + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. 
+ */ + prefetch_on_ps_disconnect(); + + pageserver_disconnect_shard(shard_no); +} + +/* + * Disconnect from specified shard + */ +static void +pageserver_disconnect_shard(shardno_t shard_no) +{ + PageServer *shard = &page_servers[shard_no]; /* * If anything goes wrong while we were sending a request, it's not clear * what state the connection is in. For example, if we sent the request * but didn't receive a response yet, we might receive the response some * time later after we have already sent a new unrelated request. Close * the connection to avoid getting confused. + * Similarly, even when we're in PS_DISCONNECTED, we may have junk to + * clean up: It is possible that we encountered an error allocating any + * of the wait event sets or the psql connection, or failed when we tried + * to attach wait events to the WaitEventSets. */ - if (connected) - { - neon_log(LOG, "dropping connection to page server due to error"); - PQfinish(pageserver_conn); - pageserver_conn = NULL; - connected = false; + CLEANUP_AND_DISCONNECT(shard); - prefetch_on_ps_disconnect(); - } - if (pageserver_conn_wes != NULL) - { - FreeWaitEventSet(pageserver_conn_wes); - pageserver_conn_wes = NULL; - } + shard->state = PS_Disconnected; } static bool -pageserver_send(NeonRequest *request) +pageserver_send(shardno_t shard_no, NeonRequest *request) { StringInfoData req_buff; - - if (CheckConnstringUpdated()) - { - pageserver_disconnect(); - ReloadConnstring(); - } + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn; /* If the connection was lost for some reason, reconnect */ - if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD) + if (shard->state == PS_Connected && PQstatus(shard->conn) == CONNECTION_BAD) { - neon_log(LOG, "pageserver_send disconnect bad connection"); - pageserver_disconnect(); + neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection"); + pageserver_disconnect(shard_no); + pageserver_conn = NULL; } req_buff = 
nm_pack_request(request); @@ -349,16 +750,20 @@ pageserver_send(NeonRequest *request) * https://github.com/neondatabase/neon/issues/1138 So try to reestablish * connection in case of failure. */ - if (!connected) + if (shard->state != PS_Connected) { - while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) + while (!pageserver_connect(shard_no, shard->n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR)) { HandleMainLoopInterrupts(); - n_reconnect_attempts += 1; + shard->n_reconnect_attempts += 1; } - n_reconnect_attempts = 0; + shard->n_reconnect_attempts = 0; + } else { + Assert(shard->conn != NULL); } + pageserver_conn = shard->conn; + /* * Send request. * @@ -366,13 +771,17 @@ pageserver_send(NeonRequest *request) * should use async mode and check for interrupts while waiting. In * practice, our requests are small enough to always fit in the output and * TCP buffer. + * + * Note that this also will fail when the connection is in the + * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this + * point, but on the grand scheme of things it's only a small issue. 
*/ if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); - pageserver_disconnect(); - neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg); + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, LOG, "pageserver_send disconnected: failed to send page request (try to reconnect): %s", msg); pfree(msg); pfree(req_buff.data); return false; @@ -384,79 +793,93 @@ pageserver_send(NeonRequest *request) { char *msg = nm_to_string((NeonMessage *) request); - neon_log(PageStoreTrace, "sent request: %s", msg); + neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg); pfree(msg); } + return true; } static NeonResponse * -pageserver_receive(void) +pageserver_receive(shardno_t shard_no) { StringInfoData resp_buff; NeonResponse *resp; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; + /* read response */ + int rc; - if (!connected) - return NULL; - - PG_TRY(); + if (shard->state != PS_Connected) { - /* read response */ - int rc; + neon_shard_log(shard_no, LOG, + "pageserver_receive: returning NULL for non-connected pageserver connection: 0x%02x", + shard->state); + return NULL; + } - rc = call_PQgetCopyData(&resp_buff.data); - if (rc >= 0) + Assert(pageserver_conn); + + rc = call_PQgetCopyData(shard_no, &resp_buff.data); + if (rc >= 0) + { + /* call_PQgetCopyData handles rc == 0 */ + Assert(rc > 0); + + PG_TRY(); { resp_buff.len = rc; resp_buff.cursor = 0; resp = nm_unpack_response(&resp_buff); PQfreemem(resp_buff.data); - - if (message_level_is_interesting(PageStoreTrace)) - { - char *msg = nm_to_string((NeonMessage *) resp); - - neon_log(PageStoreTrace, "got response: %s", msg); - pfree(msg); - } } - else if (rc == -1) + PG_CATCH(); { - neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn))); - pageserver_disconnect(); - resp 
= NULL; + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response"); + pageserver_disconnect(shard_no); + PG_RE_THROW(); } - else if (rc == -2) - { - char *msg = pchomp(PQerrorMessage(pageserver_conn)); + PG_END_TRY(); - pageserver_disconnect(); - neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg); - } - else + if (message_level_is_interesting(PageStoreTrace)) { - pageserver_disconnect(); - neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc); + char *msg = nm_to_string((NeonMessage *) resp); + + neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); + pfree(msg); } } - PG_CATCH(); + else if (rc == -1) { - neon_log(LOG, "pageserver_receive disconnect due to caught exception"); - pageserver_disconnect(); - PG_RE_THROW(); + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); + pageserver_disconnect(shard_no); + resp = NULL; + } + else if (rc == -2) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + } + else + { + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); } - PG_END_TRY(); return (NeonResponse *) resp; } static bool -pageserver_flush(void) +pageserver_flush(shardno_t shard_no) { - if (!connected) + PGconn *pageserver_conn = page_servers[shard_no].conn; + + if (page_servers[shard_no].state != PS_Connected) { - neon_log(WARNING, "Tried to flush while disconnected"); + neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected"); } else { @@ -464,12 +887,13 @@ pageserver_flush(void) { char *msg = pchomp(PQerrorMessage(pageserver_conn)); - pageserver_disconnect(); - neon_log(LOG, "pageserver_flush 
disconnect because failed to flush page requests: %s", msg); + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg); pfree(msg); return false; } } + return true; } @@ -477,7 +901,8 @@ page_server_api api = { .send = pageserver_send, .flush = pageserver_flush, - .receive = pageserver_receive + .receive = pageserver_receive, + .disconnect = pageserver_disconnect_shard }; static bool @@ -505,8 +930,9 @@ PagestoreShmemInit(void) &found); if (!found) { - pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock); - pg_atomic_init_u64(&pagestore_shared->update_counter, 0); + pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0); + pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0); + memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap)); AssignPageserverConnstring(page_server_connstring, NULL); } LWLockRelease(AddinShmemInitLock); @@ -531,7 +957,6 @@ pagestore_shmem_request(void) #endif RequestAddinShmemSpace(PagestoreShmemSize()); - RequestNamedLWLockTranche("neon_libpagestore", 1); } static void @@ -582,6 +1007,15 @@ pg_init_libpagestore(void) 0, /* no flags required */ check_neon_id, NULL, NULL); + DefineCustomIntVariable("neon.stripe_size", + "sharding stripe size", + NULL, + &stripe_size, + 32768, 1, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_BLOCKS, + NULL, NULL, NULL); + DefineCustomIntVariable("neon.max_cluster_size", "cluster size limit", NULL, @@ -619,6 +1053,16 @@ pg_init_libpagestore(void) PGC_USERSET, 0, /* no flags required */ NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL); + DefineCustomIntVariable("neon.protocol_version", + "Version of compute<->page server protocol", + NULL, + &neon_protocol_version, + 2, /* use protocol version 2 */ + 1, /* min */ + 2, /* max */ + PGC_SU_BACKEND, + 0, /* no flags required */ + NULL, NULL, NULL); relsize_hash_init(); @@ -644,5 +1088,7 @@ pg_init_libpagestore(void) dbsize_hook = neon_dbsize; 
} + memset(page_servers, 0, sizeof(page_servers)); + lfc_init(); } diff --git a/pgxn/neon/neon--1.1--1.0.sql b/pgxn/neon/neon--1.1--1.0.sql new file mode 100644 index 0000000000..e83e3104e8 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.0.sql @@ -0,0 +1,6 @@ +-- the order of operations is important here +-- because the view depends on the function + +DROP VIEW IF EXISTS neon_lfc_stats CASCADE; + +DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE; diff --git a/pgxn/neon/neon--1.1--1.2.sql b/pgxn/neon/neon--1.1--1.2.sql new file mode 100644 index 0000000000..5818b4ffe5 --- /dev/null +++ b/pgxn/neon/neon--1.1--1.2.sql @@ -0,0 +1,29 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.2'" to load this file. \quit + +-- Create a convenient view similar to pg_stat_database +-- that exposes all lfc stat values in one row. +CREATE OR REPLACE VIEW NEON_STAT_FILE_CACHE AS + WITH lfc_stats AS ( + SELECT + stat_name, + count + FROM neon_get_lfc_stats() AS t(stat_name text, count bigint) + ), + lfc_values AS ( + SELECT + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE NULL END) AS file_cache_misses, + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE NULL END) AS file_cache_hits, + MAX(CASE WHEN stat_name = 'file_cache_used' THEN count ELSE NULL END) AS file_cache_used, + MAX(CASE WHEN stat_name = 'file_cache_writes' THEN count ELSE NULL END) AS file_cache_writes, + -- Calculate the file_cache_hit_ratio within the same CTE for simplicity + CASE + WHEN MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) = 0 THEN NULL + ELSE ROUND((MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END)::DECIMAL / + (MAX(CASE WHEN stat_name = 'file_cache_hits' THEN count ELSE 0 END) + MAX(CASE WHEN stat_name = 'file_cache_misses' THEN count ELSE 0 END))) * 100, 2) + END AS file_cache_hit_ratio + FROM lfc_stats + ) +SELECT file_cache_misses, file_cache_hits, file_cache_used, 
file_cache_writes, file_cache_hit_ratio from lfc_values; + +-- externalize the view to all users in role pg_monitor +GRANT SELECT ON NEON_STAT_FILE_CACHE TO PG_MONITOR; \ No newline at end of file diff --git a/pgxn/neon/neon--1.2--1.1.sql b/pgxn/neon/neon--1.2--1.1.sql new file mode 100644 index 0000000000..c9f6a40f73 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.1.sql @@ -0,0 +1 @@ +DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE; diff --git a/pgxn/neon/neon--1.2--1.3.sql b/pgxn/neon/neon--1.2--1.3.sql new file mode 100644 index 0000000000..9583008777 --- /dev/null +++ b/pgxn/neon/neon--1.2--1.3.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit + +CREATE FUNCTION approximate_working_set_size(reset bool) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.3--1.2.sql b/pgxn/neon/neon--1.3--1.2.sql new file mode 100644 index 0000000000..2733a15c75 --- /dev/null +++ b/pgxn/neon/neon--1.3--1.2.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b930fdb3ca..276d1542fe 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -11,25 +11,262 @@ #include "postgres.h" #include "fmgr.h" +#include "miscadmin.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "catalog/pg_type.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "replication/logical.h" +#include "replication/slot.h" #include "replication/walsender.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" #include "utils/pg_lsn.h" #include "utils/guc.h" +#include "utils/wait_event.h" +#include "extension_server.h" #include "neon.h" #include "walproposer.h" #include 
"pagestore_client.h" #include "control_plane_connector.h" +#include "walsender_hooks.h" PG_MODULE_MAGIC; void _PG_init(void); +static int logical_replication_max_snap_files = 300; +bool primary_is_running = false; + +static void +InitLogicalReplicationMonitor(void) +{ + BackgroundWorker bgw; + + DefineCustomIntVariable( + "neon.logical_replication_max_snap_files", + "Maximum allowed logical replication .snap files", + NULL, + &logical_replication_max_snap_files, + 300, 0, INT_MAX, + PGC_SIGHUP, + 0, + NULL, NULL, NULL); + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +static int +LsnDescComparator(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return 1; + else if (lsn1 == lsn2) + return 0; + else + return -1; +} + +/* + * Look at .snap files and calculate minimum allowed restart_lsn of slot so that + * next gc would leave not more than logical_replication_max_snap_files; all + * slots having lower restart_lsn should be dropped. 
+ */ +static XLogRecPtr +get_num_snap_files_lsn_threshold(void) +{ + DIR *dirdesc; + struct dirent *de; + char *snap_path = "pg_logical/snapshots/"; + int lsns_allocated = 1024; + int lsns_num = 0; + XLogRecPtr *lsns; + XLogRecPtr cutoff; + + if (logical_replication_max_snap_files < 0) + return 0; + + lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated); + + /* find all .snap files and get their lsns */ + dirdesc = AllocateDir(snap_path); + while ((de = ReadDir(dirdesc, snap_path)) != NULL) + { + XLogRecPtr lsn; + uint32 hi; + uint32 lo; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2) + { + ereport(LOG, + (errmsg("could not parse file name as .snap file \"%s\"", de->d_name))); + continue; + } + + lsn = ((uint64) hi) << 32 | lo; + elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn)); + if (lsns_allocated == lsns_num) + { + lsns_allocated *= 2; + lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated); + } + lsns[lsns_num++] = lsn; + } + /* sort by lsn desc */ + qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator); + /* and take cutoff at logical_replication_max_snap_files */ + if (logical_replication_max_snap_files > lsns_num) + cutoff = 0; + /* have less files than cutoff */ + else + { + cutoff = lsns[logical_replication_max_snap_files - 1]; + elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d", + LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files); + } + pfree(lsns); + FreeDir(dirdesc); + return cutoff; +} + +#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */ + +/* + * Unused logical replication slots pins WAL and prevents deletion of snapshots. + * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which + * need too many .snap files. + */ +PGDLLEXPORT void +LogicalSlotsMonitorMain(Datum main_arg) +{ + /* Establish signal handlers. 
*/ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + for (;;) + { + XLogRecPtr cutoff_lsn; + + /* + * If there are too many .snap files, just drop all logical slots to + * prevent aux files bloat. + */ + cutoff_lsn = get_num_snap_files_lsn_threshold(); + if (cutoff_lsn > 0) + { + for (int i = 0; i < max_replication_slots; i++) + { + char slot_name[NAMEDATALEN]; + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + XLogRecPtr restart_lsn; + + /* find the name */ + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + /* Consider only logical repliction slots */ + if (!s->in_use || !SlotIsLogical(s)) + { + LWLockRelease(ReplicationSlotControlLock); + continue; + } + + /* do we need to drop it? */ + SpinLockAcquire(&s->mutex); + restart_lsn = s->data.restart_lsn; + SpinLockRelease(&s->mutex); + if (restart_lsn >= cutoff_lsn) + { + LWLockRelease(ReplicationSlotControlLock); + continue; + } + + strlcpy(slot_name, s->data.name.data, NAMEDATALEN); + elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X", + slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); + LWLockRelease(ReplicationSlotControlLock); + + /* now try to drop it, killing owner before if any */ + for (;;) + { + pid_t active_pid; + + SpinLockAcquire(&s->mutex); + active_pid = s->active_pid; + SpinLockRelease(&s->mutex); + + if (active_pid == 0) + { + /* + * Slot is releasted, try to drop it. Though of course + * it could have been reacquired, so drop can ERROR + * out. Similarly it could have been dropped in the + * meanwhile. + * + * In principle we could remove pg_try/pg_catch, that + * would restart the whole bgworker. 
+ */ + ConditionVariableCancelSleep(); + PG_TRY(); + { + ReplicationSlotDrop(slot_name, true); + elog(LOG, "ls_monitor: slot %s dropped", slot_name); + } + PG_CATCH(); + { + /* log ERROR and reset elog stack */ + EmitErrorReport(); + FlushErrorState(); + elog(LOG, "ls_monitor: failed to drop slot %s", slot_name); + } + PG_END_TRY(); + break; + } + else + { + /* kill the owner and wait for release */ + elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid); + (void) kill(active_pid, SIGTERM); + /* We shouldn't get stuck, but to be safe add timeout. */ + ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); + } + } + } + } + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, + LS_MONITOR_CHECK_INTERVAL, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } +} + void _PG_init(void) { @@ -43,11 +280,24 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); + WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + + InitLogicalReplicationMonitor(); InitControlPlaneConnector(); pg_init_extension_server(); + DefineCustomBoolVariable( + "neon.primary_is_running", + "true if the primary was running at replica startup. 
false otherwise", + NULL, + &primary_is_running, + false, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the @@ -65,7 +315,7 @@ pg_cluster_size(PG_FUNCTION_ARGS) { int64 size; - size = GetZenithCurrentClusterSize(); + size = GetNeonCurrentClusterSize(); if (size == 0) PG_RETURN_NULL(); diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index 4e4cb9f372..cee2f336f2 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,5 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -default_version = '1.1' +default_version = '1.3' module_pathname = '$libdir/neon' relocatable = true +trusted = true diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index c3afecc679..5c653fc6c6 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -25,12 +25,13 @@ extern int wal_acceptor_connection_timeout; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); -extern void pg_init_extension_server(void); - extern uint64 BackpressureThrottlingTime(void); +extern void SetNeonCurrentClusterSize(uint64 size); +extern uint64 GetNeonCurrentClusterSize(void); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]); extern void PGDLLEXPORT WalProposerMain(Datum main_arg); +PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); #endif /* NEON_H */ diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c index 9135847aaf..1fb4ed9522 100644 --- a/pgxn/neon/neon_utils.c +++ b/pgxn/neon/neon_utils.c @@ -1,8 +1,12 @@ - #include +#ifndef WALPROPOSER_LIB +#include +#endif + #include "postgres.h" +#include "neon_utils.h" #include "lib/stringinfo.h" #include "libpq/pqformat.h" @@ -11,7 +15,7 @@ * * Returns -1 if the character is not a hexadecimal digit. 
*/ -int +static int HexDecodeChar(char c) { if (c >= '0' && c <= '9') @@ -114,3 +118,48 @@ disable_core_dump() fprintf(stderr, "WARNING: disable cores setrlimit failed: %s", strerror(save_errno)); } } + +#ifndef WALPROPOSER_LIB + +/* + * On macOS with a libcurl that has IPv6 support, curl_global_init() calls + * SCDynamicStoreCopyProxies(), which makes the program multithreaded. An ideal + * place to call curl_global_init() would be _PG_init(), but Neon has to be + * added to shared_preload_libraries, which are loaded in the Postmaster + * process. The Postmaster is not supposed to become multithreaded at any point + * in its lifecycle. Postgres doesn't have any good hook that I know of to + * initialize per-backend structures, so we have to check this on any + * allocation of a CURL handle. + * + * Free the allocated CURL handle with curl_easy_cleanup(3). + * + * https://developer.apple.com/documentation/systemconfiguration/1517088-scdynamicstorecopyproxies + */ +CURL * +alloc_curl_handle(void) +{ + static bool curl_initialized = false; + + CURL *handle; + + if (unlikely(!curl_initialized)) + { + /* Protected by mutex internally */ + if (curl_global_init(CURL_GLOBAL_DEFAULT)) + { + elog(ERROR, "Failed to initialize curl"); + } + + curl_initialized = true; + } + + handle = curl_easy_init(); + if (handle == NULL) + { + elog(ERROR, "Failed to initialize curl handle"); + } + + return handle; +} + +#endif diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h index a86f1e061c..89683714f1 100644 --- a/pgxn/neon/neon_utils.h +++ b/pgxn/neon/neon_utils.h @@ -1,11 +1,23 @@ #ifndef __NEON_UTILS_H__ #define __NEON_UTILS_H__ +#include "lib/stringinfo.h" + +#ifndef WALPROPOSER_LIB +#include +#endif + bool HexDecodeString(uint8 *result, char *input, int nbytes); uint32 pq_getmsgint32_le(StringInfo msg); uint64 pq_getmsgint64_le(StringInfo msg); void pq_sendint32_le(StringInfo buf, uint32 i); void pq_sendint64_le(StringInfo buf, uint64 i); -extern void disable_core_dump(); 
+void disable_core_dump(void); + +#ifndef WALPROPOSER_LIB + +CURL * alloc_curl_handle(void); + +#endif #endif /* __NEON_UTILS_H__ */ diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index f7ec9e5bfa..60eb8e1fc9 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -36,10 +36,7 @@ static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state); -static void NeonWALReaderResetRemote(NeonWALReader *state); static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); -static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); -static void neon_wal_segment_close(NeonWALReader *state); static bool is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli); @@ -82,8 +79,9 @@ struct NeonWALReader XLogRecPtr req_lsn; Size req_len; Size req_progress; - WalProposer *wp; /* we learn donor through walproposer */ + char donor_conninfo[MAXCONNINFO]; char donor_name[64]; /* saved donor safekeeper name for logging */ + XLogRecPtr donor_lsn; /* state of connection to safekeeper */ NeonWALReaderRemoteState rem_state; WalProposerConn *wp_conn; @@ -107,7 +105,7 @@ struct NeonWALReader /* palloc and initialize NeonWALReader */ NeonWALReader * -NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix) +NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix) { NeonWALReader *reader; @@ -123,8 +121,6 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalPropose reader->seg.ws_tli = 0; reader->segcxt.ws_segsize = wal_segment_size; - reader->wp = wp; - reader->rem_state = RS_NONE; if (log_prefix) @@ -188,8 +184,8 @@ NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, Ti } else if (state->wre_errno == 
ENOENT) { - nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote", - LSN_FORMAT_ARGS(startptr)); + nwr_log(LOG, "local read at %X/%X len %zu failed as segment file doesn't exist, attempting remote", + LSN_FORMAT_ARGS(startptr), count); return NeonWALReadRemote(state, buf, startptr, count, tli); } else @@ -204,21 +200,16 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou { if (state->rem_state == RS_NONE) { - XLogRecPtr donor_lsn; - - /* no connection yet; start one */ - Safekeeper *donor = GetDonor(state->wp, &donor_lsn); - - if (donor == NULL) + if (!NeonWALReaderUpdateDonor(state)) { snprintf(state->err_msg, sizeof(state->err_msg), "failed to establish remote connection to fetch WAL: no donor available"); return NEON_WALREAD_ERROR; + } - snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port); - nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL", - state->donor_name, LSN_FORMAT_ARGS(donor_lsn)); - state->wp_conn = libpqwp_connect_start(donor->conninfo); + /* no connection yet; start one */ + nwr_log(LOG, "establishing connection to %s, lsn=%X/%X to fetch WAL", state->donor_name, LSN_FORMAT_ARGS(state->donor_lsn)); + state->wp_conn = libpqwp_connect_start(state->donor_conninfo); if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD) { snprintf(state->err_msg, sizeof(state->err_msg), @@ -251,10 +242,22 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou { /* connection successfully established */ char start_repl_query[128]; + term_t term = pg_atomic_read_u64(&GetWalpropShmemState()->mineLastElectedTerm); + /* + * Set elected walproposer's term to pull only data from + * its history. Note: for logical walsender it means we + * might stream WAL not yet committed by safekeepers. It + * would be cleaner to fix this. 
+ * + * mineLastElectedTerm shouldn't be 0 at this point + * because we checked above that donor exists and it + * appears only after successfull election. + */ + Assert(term > 0); snprintf(start_repl_query, sizeof(start_repl_query), "START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')", - LSN_FORMAT_ARGS(startptr), state->wp->propTerm); + LSN_FORMAT_ARGS(startptr), term); nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s", state->donor_name, start_repl_query); if (!libpqwp_send_query(state->wp_conn, start_repl_query)) @@ -404,6 +407,10 @@ NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size cou state->req_lsn = InvalidXLogRecPtr; state->req_len = 0; state->req_progress = 0; + + /* Update the current segment info. */ + state->seg.ws_tli = tli; + return NEON_WALREAD_SUCCESS; } } @@ -526,7 +533,7 @@ err: } /* reset remote connection and request in progress */ -static void +void NeonWALReaderResetRemote(NeonWALReader *state) { state->req_lsn = InvalidXLogRecPtr; @@ -607,6 +614,7 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun uint32 startoff; int segbytes; int readbytes; + XLogSegNo lastRemovedSegNo; startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); @@ -682,6 +690,23 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return false; } + /* + * Recheck that the segment hasn't been removed while we were reading + * it. 
+ */ + lastRemovedSegNo = XLogGetLastRemovedSegno(); + if (state->seg.ws_segno <= lastRemovedSegNo) + { + char fname[MAXFNAMELEN]; + + state->wre_errno = ENOENT; + + XLogFileName(fname, tli, state->seg.ws_segno, state->segcxt.ws_segsize); + snprintf(state->err_msg, sizeof(state->err_msg), "WAL segment %s has been removed during the read, lastRemovedSegNo " UINT64_FORMAT, + fname, lastRemovedSegNo); + return false; + } + /* Update state for read */ recptr += readbytes; nbytes -= readbytes; @@ -691,13 +716,25 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun return true; } +XLogRecPtr +NeonWALReaderGetRemLsn(NeonWALReader *state) +{ + return state->rem_lsn; +} + +const WALOpenSegment * +NeonWALReaderGetSegment(NeonWALReader *state) +{ + return &state->seg; +} + /* * Copy of vanilla wal_segment_open, but returns false in case of error instead * of ERROR, with errno set. * * XLogReaderRoutine->segment_open callback for local pg_wal files */ -static bool +bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p) { @@ -724,7 +761,7 @@ is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli) } /* copy of vanilla wal_segment_close with NeonWALReader */ -static void +void neon_wal_segment_close(NeonWALReader *state) { if (state->seg.ws_file >= 0) @@ -740,3 +777,19 @@ NeonWALReaderErrMsg(NeonWALReader *state) { return state->err_msg; } + +/* + * Returns true if there is a donor, and false otherwise + */ +bool +NeonWALReaderUpdateDonor(NeonWALReader *state) +{ + WalproposerShmemState *wps = GetWalpropShmemState(); + + SpinLockAcquire(&wps->mutex); + memcpy(state->donor_name, wps->donor_name, sizeof(state->donor_name)); + memcpy(state->donor_conninfo, wps->donor_conninfo, sizeof(state->donor_conninfo)); + state->donor_lsn = wps->donor_lsn; + SpinLockRelease(&wps->mutex); + return state->donor_name[0] != '\0'; +} diff --git a/pgxn/neon/neon_walreader.h b/pgxn/neon/neon_walreader.h index 
6be9f149aa..3e41825069 100644 --- a/pgxn/neon/neon_walreader.h +++ b/pgxn/neon/neon_walreader.h @@ -19,12 +19,19 @@ typedef enum NEON_WALREAD_ERROR, } NeonWALReadResult; -extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix); +extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix); extern void NeonWALReaderFree(NeonWALReader *state); +extern void NeonWALReaderResetRemote(NeonWALReader *state); extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli); extern pgsocket NeonWALReaderSocket(NeonWALReader *state); extern uint32 NeonWALReaderEvents(NeonWALReader *state); extern bool NeonWALReaderIsRemConnEstablished(NeonWALReader *state); extern char *NeonWALReaderErrMsg(NeonWALReader *state); +extern XLogRecPtr NeonWALReaderGetRemLsn(NeonWALReader *state); +extern const WALOpenSegment *NeonWALReaderGetSegment(NeonWALReader *state); +extern bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p); +extern void neon_wal_segment_close(NeonWALReader *state); +extern bool NeonWALReaderUpdateDonor(NeonWALReader *state); + #endif /* __NEON_WALREADER_H__ */ diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 3fcaab0bee..8951e6607b 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -15,14 +15,19 @@ #include "neon_pgversioncompat.h" +#include "access/slru.h" #include "access/xlogdefs.h" #include RELFILEINFO_HDR #include "lib/stringinfo.h" #include "libpq/pqformat.h" #include "storage/block.h" +#include "storage/buf_internals.h" #include "storage/smgr.h" #include "utils/memutils.h" +#define MAX_SHARDS 128 +#define MAX_PAGESERVER_CONNSTRING_SIZE 256 + typedef enum { /* pagestore_client -> pagestore */ @@ -30,6 +35,7 @@ typedef enum T_NeonNblocksRequest, T_NeonGetPageRequest, T_NeonDbSizeRequest, + 
T_NeonGetSlruSegmentRequest, /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -37,6 +43,7 @@ typedef enum T_NeonGetPageResponse, T_NeonErrorResponse, T_NeonDbSizeResponse, + T_NeonGetSlruSegmentResponse, } NeonMessageTag; /* base struct for c-style inheritance */ @@ -51,19 +58,44 @@ typedef struct #define neon_log(tag, fmt, ...) ereport(tag, \ (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) +#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \ + (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) -/* - * supertype of all the Neon*Request structs below +/* SLRUs downloadable from page server */ +typedef enum { + SLRU_CLOG, + SLRU_MULTIXACT_MEMBERS, + SLRU_MULTIXACT_OFFSETS +} SlruKind; + +/*-- + * supertype of all the Neon*Request structs below. * - * If 'latest' is true, we are requesting the latest page version, and 'lsn' - * is just a hint to the server that we know there are no versions of the page - * (or relation size, for exists/nblocks requests) later than the 'lsn'. + * All requests contain two LSNs: + * + * lsn: request page (or relation size, etc) at this LSN + * not_modified_since: Hint that the page hasn't been modified between + * this LSN and the request LSN (`lsn`). + * + * To request the latest version of a page, you can use MAX_LSN as the request + * LSN. + * + * If you don't know any better, you can always set 'not_modified_since' equal + * to 'lsn', but providing a lower value can speed up processing the request + * in the pageserver, as it doesn't need to wait for the WAL to arrive, and it + * can skip traversing through recent layers which we know to not contain any + * versions for the requested page. + * + * These structs describe the V2 of these requests. The old V1 protocol contained + * just one LSN and a boolean 'latest' flag. 
If the neon_protocol_version GUC is + * set to 1, we will convert these to the V1 requests before sending. */ typedef struct { NeonMessageTag tag; - bool latest; /* if true, request latest page version */ - XLogRecPtr lsn; /* request page version @ this LSN */ + XLogRecPtr lsn; + XLogRecPtr not_modified_since; } NeonRequest; typedef struct @@ -94,6 +126,13 @@ typedef struct BlockNumber blkno; } NeonGetPageRequest; +typedef struct +{ + NeonRequest req; + SlruKind kind; + int segno; +} NeonGetSlruSegmentRequest; + /* supertype of all the Neon*Response structs below */ typedef struct { @@ -133,6 +172,14 @@ typedef struct * message */ } NeonErrorResponse; +typedef struct +{ + NeonMessageTag tag; + int n_blocks; + char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT]; +} NeonGetSlruSegmentResponse; + + extern StringInfoData nm_pack_request(NeonRequest *msg); extern NeonResponse *nm_unpack_response(StringInfo s); extern char *nm_to_string(NeonMessage *msg); @@ -141,11 +188,14 @@ extern char *nm_to_string(NeonMessage *msg); * API */ +typedef unsigned shardno_t; + typedef struct { - bool (*send) (NeonRequest *request); - NeonResponse *(*receive) (void); - bool (*flush) (void); + bool (*send) (shardno_t shard_no, NeonRequest * request); + NeonResponse *(*receive) (shardno_t shard_no); + bool (*flush) (shardno_t shard_no); + void (*disconnect) (shardno_t shard_no); } page_server_api; extern void prefetch_on_ps_disconnect(void); @@ -158,6 +208,9 @@ extern int readahead_buffer_size; extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; +extern int neon_protocol_version; + +extern shardno_t get_shard_number(BufferTag* tag); extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); @@ -184,18 +237,50 @@ extern void neon_zeroextend(SMgrRelation reln, ForkNumber forknum, extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +/* + * LSN values associated with each request to the 
pageserver + */ +typedef struct +{ + /* + * 'request_lsn' is the main value that determines which page version to + * fetch. + */ + XLogRecPtr request_lsn; + + /* + * A hint to the pageserver that the requested page hasn't been modified + * between this LSN and 'request_lsn'. That allows the pageserver to + * return the page faster, without waiting for 'request_lsn' to arrive in + * the pageserver, as long as 'not_modified_since' has arrived. + */ + XLogRecPtr not_modified_since; + + /* + * 'effective_request_lsn' is not included in the request that's sent to + * the pageserver, but is used to keep track of the latest LSN of when the + * request was made. In a standby server, this is always the same as the + * 'request_lsn', but in the primary we use UINT64_MAX as the + * 'request_lsn' to request the latest page version, so we need this + * separate field to remember that latest LSN was when the request was + * made. It's needed to manage prefetch request, to verify if the response + * to a prefetched request is still valid. 
+ */ + XLogRecPtr effective_request_lsn; +} neon_request_lsns; + #if PG_MAJORVERSION_NUM < 16 extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + neon_request_lsns request_lsns, char *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); #else extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer); + neon_request_lsns request_lsns, void *buffer); extern void neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); #endif diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 8888cd89c6..8edaf65639 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -45,6 +45,7 @@ */ #include "postgres.h" +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" @@ -93,6 +94,10 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; +#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) 
\ + neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ + ##__VA_ARGS__) + page_server_api *page_server; /* unlogged relation build states */ @@ -168,10 +173,10 @@ typedef enum PrefetchStatus typedef struct PrefetchRequest { BufferTag buftag; /* must be first entry in the struct */ - XLogRecPtr effective_request_lsn; - XLogRecPtr actual_request_lsn; + neon_request_lsns request_lsns; NeonResponse *response; /* may be null */ PrefetchStatus status; + shardno_t shard_no; uint64 my_ring_index; } PrefetchRequest; @@ -239,10 +244,17 @@ typedef struct PrefetchState * also unused */ /* the buffers */ - prfh_hash *prf_hash; + prfh_hash *prf_hash; + int max_shard_no; + /* Mark shards involved in prefetch */ + uint8 shard_bitmap[(MAX_SHARDS + 7)/8]; PrefetchRequest prf_buffer[]; /* prefetch buffers */ } PrefetchState; +#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7))) +#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7)) +#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7)) + static PrefetchState *MyPState; #define GetPrfSlot(ring_index) ( \ @@ -261,19 +273,18 @@ static PrefetchState *MyPState; ) \ ) -static XLogRecPtr prefetch_lsn = 0; - static bool compact_prefetch_buffers(void); static void consume_prefetch_responses(void); -static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn); +static uint64 prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns); static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn); +static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); static bool prefetch_wait_for(uint64 ring_index); static void prefetch_cleanup_trailing_unused(void); static inline void prefetch_set_unused(uint64 ring_index); -static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, - ForkNumber forknum, 
BlockNumber blkno); +static neon_request_lsns neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno); +static bool neon_prefetch_response_usable(neon_request_lsns request_lsns, + PrefetchRequest *slot); static bool compact_prefetch_buffers(void) @@ -327,10 +338,10 @@ compact_prefetch_buffers(void) Assert(target_slot->status == PRFS_UNUSED); target_slot->buftag = source_slot->buftag; + target_slot->shard_no = source_slot->shard_no; target_slot->status = source_slot->status; target_slot->response = source_slot->response; - target_slot->effective_request_lsn = source_slot->effective_request_lsn; - target_slot->actual_request_lsn = source_slot->actual_request_lsn; + target_slot->request_lsns = source_slot->request_lsns; target_slot->my_ring_index = empty_ring_index; prfh_delete(MyPState->prf_hash, source_slot); @@ -349,7 +360,9 @@ compact_prefetch_buffers(void) }; source_slot->response = NULL; source_slot->my_ring_index = 0; - source_slot->effective_request_lsn = 0; + source_slot->request_lsns = (neon_request_lsns) { + InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr + }; /* update bookkeeping */ n_moved++; @@ -494,12 +507,31 @@ prefetch_cleanup_trailing_unused(void) } } + +static bool +prefetch_flush_requests(void) +{ + for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++) + { + if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no)) + { + if (!page_server->flush(shard_no)) + return false; + BITMAP_CLR(MyPState->shard_bitmap, shard_no); + } + } + MyPState->max_shard_no = 0; + return true; +} + /* * Wait for slot of ring_index to have received its response. * The caller is responsible for making sure the request buffer is flushed. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * NOTE: callers should make sure they can handle query cancellations in this + * function's call path. 
*/ static bool prefetch_wait_for(uint64 ring_index) @@ -509,7 +541,7 @@ prefetch_wait_for(uint64 ring_index) if (MyPState->ring_flush <= ring_index && MyPState->ring_unused > MyPState->ring_flush) { - if (!page_server->flush()) + if (!prefetch_flush_requests()) return false; MyPState->ring_flush = MyPState->ring_unused; } @@ -535,6 +567,8 @@ prefetch_wait_for(uint64 ring_index) * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. + * + * NOTE: this does IO, and can get canceled out-of-line. */ static bool prefetch_read(PrefetchRequest *slot) @@ -546,8 +580,16 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long)slot->my_ring_index, (long)MyPState->ring_receive); + old = MemoryContextSwitchTo(MyPState->errctx); - response = (NeonResponse *) page_server->receive(); + response = (NeonResponse *) page_server->receive(slot->shard_no); MemoryContextSwitchTo(old); if (response) { @@ -563,6 +605,11 @@ prefetch_read(PrefetchRequest *slot) } else { + neon_shard_log(slot->shard_no, LOG, + "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. 
This can be caused by a concurrent disconnect", + (long)slot->my_ring_index, + RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), + slot->buftag.forkNum, slot->buftag.blockNum); return false; } } @@ -577,6 +624,7 @@ void prefetch_on_ps_disconnect(void) { MyPState->ring_flush = MyPState->ring_unused; + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; @@ -587,10 +635,19 @@ prefetch_on_ps_disconnect(void) Assert(slot->status == PRFS_REQUESTED); Assert(slot->my_ring_index == ring_index); + /* + * Drop connection to all shards which have prefetch requests. + * It is not a problem to call disconnect multiple times on the same connection + * because disconnect implementation in libpagestore.c will check if connection + * is alive and do nothing of connection was already dropped. + */ + page_server->disconnect(slot->shard_no); + /* clean up the request */ slot->status = PRFS_TAG_REMAINS; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + prefetch_set_unused(ring_index); } } @@ -607,13 +664,12 @@ prefetch_on_ps_disconnect(void) static inline void prefetch_set_unused(uint64 ring_index) { - PrefetchRequest *slot = GetPrfSlot(ring_index); + PrefetchRequest *slot; if (ring_index < MyPState->ring_last) return; /* Should already be unused */ - Assert(MyPState->ring_unused > ring_index); - + slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) return; @@ -650,71 +706,53 @@ prefetch_set_unused(uint64 ring_index) compact_prefetch_buffers(); } +/* + * Send one prefetch request to the pageserver. To wait for the response, call + * prefetch_wait_for(). 
+ */ static void -prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) { bool found; + uint64 mySlotNo = slot->my_ring_index; + NeonGetPageRequest request = { .req.tag = T_NeonGetPageRequest, - .req.latest = false, - .req.lsn = 0, + /* lsn and not_modified_since are filled in below */ .rinfo = BufTagGetNRelFileInfo(slot->buftag), .forknum = slot->buftag.forkNum, .blkno = slot->buftag.blockNum, }; - if (force_lsn && force_latest) - { - request.req.lsn = *force_lsn; - request.req.latest = *force_latest; - slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn; - } - else - { - XLogRecPtr lsn = neon_get_request_lsn( - &request.req.latest, - BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, - slot->buftag.blockNum - ); + Assert(mySlotNo == MyPState->ring_unused); - /* - * Note: effective_request_lsn is potentially higher than the - * requested LSN, but still correct: - * - * We know there are no changes between the actual requested LSN and - * the value of effective_request_lsn: If there were, the page would - * have been in cache and evicted between those LSN values, which then - * would have had to result in a larger request LSN for this page. - * - * It is possible that a concurrent backend loads the page, modifies - * it and then evicts it again, but the LSN of that eviction cannot be - * smaller than the current WAL insert/redo pointer, which is already - * larger than this prefetch_lsn. So in any case, that would - * invalidate this cache. - * - * The best LSN to use for effective_request_lsn would be - * XLogCtl->Insert.RedoRecPtr, but that's expensive to access. 
- */ - slot->actual_request_lsn = request.req.lsn = lsn; - prefetch_lsn = Max(prefetch_lsn, lsn); - slot->effective_request_lsn = prefetch_lsn; - } + if (force_request_lsns) + slot->request_lsns = *force_request_lsns; + else + slot->request_lsns = neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), + slot->buftag.forkNum, + slot->buftag.blockNum); + request.req.lsn = slot->request_lsns.request_lsn; + request.req.not_modified_since = slot->request_lsns.not_modified_since; Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_unused); - while (!page_server->send((NeonRequest *) &request)); + while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) + { + Assert(mySlotNo == MyPState->ring_unused); + /* loop */ + } /* update prefetch state */ MyPState->n_requests_inflight += 1; MyPState->n_unused -= 1; MyPState->ring_unused += 1; + BITMAP_SET(MyPState->shard_bitmap, slot->shard_no); + MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no); /* update slot state */ slot->status = PRFS_REQUESTED; - - prfh_insert(MyPState->prf_hash, slot, &found); Assert(!found); } @@ -724,16 +762,16 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force * * Register that we may want the contents of BufferTag in the near future. * - * If force_latest and force_lsn are not NULL, those values are sent to the - * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure - * to fill in these values manually. + * If force_request_lsns is not NULL, those values are sent to the + * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure + * to calculate the LSNs to send. * * NOTE: this function may indirectly update MyPState->pfs_hash; which * invalidates any active pointers into the hash table. 
*/ static uint64 -prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn) +prefetch_register_buffer(BufferTag tag, neon_request_lsns *force_request_lsns) { uint64 ring_index; PrefetchRequest req; @@ -757,38 +795,18 @@ Retry: Assert(BUFFERTAGS_EQUAL(slot->buftag, tag)); /* - * If we want a specific lsn, we do not accept requests that were made - * with a potentially different LSN. + * If the caller specified a request LSN to use, only accept prefetch + * responses that satisfy that request. */ - if (force_latest && force_lsn) + if (force_request_lsns) { - /* - * if we want the latest version, any effective_request_lsn < - * request lsn is OK - */ - if (*force_latest) + if (!neon_prefetch_response_usable(*force_request_lsns, slot)) { - if (*force_lsn > slot->effective_request_lsn) - { - prefetch_wait_for(ring_index); - prefetch_set_unused(ring_index); - entry = NULL; - } - - } - - /* - * if we don't want the latest version, only accept requests with - * the exact same LSN - */ - else - { - if (*force_lsn != slot->effective_request_lsn) - { - prefetch_wait_for(ring_index); - prefetch_set_unused(ring_index); - entry = NULL; - } + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; } } @@ -851,7 +869,8 @@ Retry: { case PRFS_REQUESTED: Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; prefetch_set_unused(cleanup_index); break; case PRFS_RECEIVED: @@ -880,9 +899,10 @@ Retry: * function reads the buffer tag from the slot. 
*/ slot->buftag = tag; + slot->shard_no = get_shard_number(&tag); slot->my_ring_index = ring_index; - prefetch_do_request(slot, force_latest, force_lsn); + prefetch_do_request(slot, force_request_lsns); Assert(slot->status == PRFS_REQUESTED); Assert(MyPState->ring_last <= ring_index && ring_index < MyPState->ring_unused); @@ -890,7 +910,7 @@ Retry: if (flush_every_n_requests > 0 && MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) { - if (!page_server->flush()) + if (!prefetch_flush_requests()) { /* * Prefetch set is reset in case of error, so we should try to @@ -904,20 +924,73 @@ Retry: return ring_index; } +/* + * Note: this function can get canceled and use a long jump to the next catch + * context. Take care. + */ static NeonResponse * page_server_request(void const *req) { NeonResponse *resp; + BufferTag tag = {0}; + shardno_t shard_no; + + switch (messageTag(req)) + { + case T_NeonExistsRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); + break; + case T_NeonNblocksRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo); + break; + case T_NeonDbSizeRequest: + NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode; + break; + case T_NeonGetPageRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo); + tag.blockNum = ((NeonGetPageRequest *) req)->blkno; + break; + default: + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); + } + shard_no = get_shard_number(&tag); + + /* + * Current sharding model assumes that all metadata is present only at shard 0. + * We still need to call get_shard_no() to check if shard map is up-to-date. 
+ */ + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || + ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + { + shard_no = 0; + } do { - while (!page_server->send((NeonRequest *) req) || !page_server->flush()); - MyPState->ring_flush = MyPState->ring_unused; - consume_prefetch_responses(); - resp = page_server->receive(); - } while (resp == NULL); - return resp; + PG_TRY(); + { + while (!page_server->send(shard_no, (NeonRequest *) req) + || !page_server->flush(shard_no)) + { + /* do nothing */ + } + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + } + PG_CATCH(); + { + /* + * Cancellation in this code needs to be handled better at some + * point, but this currently seems fine for now. + */ + page_server->disconnect(shard_no); + PG_RE_THROW(); + } + PG_END_TRY(); + } while (resp == NULL); + + return resp; } @@ -927,8 +1000,52 @@ nm_pack_request(NeonRequest *msg) StringInfoData s; initStringInfo(&s); - pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 2) + { + pq_sendbyte(&s, msg->tag); + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); + } + else + { + bool latest; + XLogRecPtr lsn; + + /* + * In primary, we always request the latest page version. + */ + if (!RecoveryInProgress()) + { + latest = true; + lsn = msg->not_modified_since; + } + else + { + /* + * In the protocol V1, we cannot represent that we want to read + * page at LSN X, and we know that it hasn't been modified since + * Y. We can either use 'not_modified_lsn' as the request LSN, and + * risk getting an error if that LSN is too old and has already + * fallen out of the pageserver's GC horizon, or we can send + * 'request_lsn', causing the pageserver to possibly wait for the + * recent WAL to arrive unnecessarily. Or something in between. We + * choose to use the old LSN and risk GC errors, because that's + * what we've done historically. 
+ */ + latest = false; + lsn = msg->not_modified_since; + } + + pq_sendbyte(&s, msg->tag); + pq_sendbyte(&s, latest); + pq_sendint64(&s, lsn); + } + + /* + * The rest of the request messages are the same between protocol V1 and + * V2 + */ switch (messageTag(msg)) { /* pagestore_client -> pagestore */ @@ -936,8 +1053,6 @@ nm_pack_request(NeonRequest *msg) { NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -949,8 +1064,6 @@ nm_pack_request(NeonRequest *msg) { NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -962,8 +1075,6 @@ nm_pack_request(NeonRequest *msg) { NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, msg_req->dbNode); break; @@ -972,8 +1083,6 @@ nm_pack_request(NeonRequest *msg) { NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - pq_sendbyte(&s, msg_req->req.latest); - pq_sendint64(&s, msg_req->req.lsn); pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); @@ -983,14 +1092,25 @@ nm_pack_request(NeonRequest *msg) break; } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + pq_sendbyte(&s, msg_req->kind); + pq_sendint32(&s, msg_req->segno); + + break; + } + /* pagestore -> pagestore_client. We never need to create these. 
*/ case T_NeonExistsResponse: case T_NeonNblocksResponse: case T_NeonGetPageResponse: case T_NeonErrorResponse: case T_NeonDbSizeResponse: + case T_NeonGetSlruSegmentResponse: default: - elog(ERROR, "unexpected neon message tag 0x%02x", msg->tag); + neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); break; } return s; @@ -1075,6 +1195,20 @@ nm_unpack_response(StringInfo s) break; } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp; + int n_blocks = pq_getmsgint(s, 4); + msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse)); + msg_resp->tag = tag; + msg_resp->n_blocks = n_blocks; + memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + /* * pagestore_client -> pagestore * @@ -1084,8 +1218,9 @@ nm_unpack_response(StringInfo s) case T_NeonNblocksRequest: case T_NeonGetPageRequest: case T_NeonDbSizeRequest: + case T_NeonGetSlruSegmentRequest: default: - elog(ERROR, "unexpected neon message tag 0x%02x", tag); + neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); break; } @@ -1111,7 +1246,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1124,7 +1259,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", 
\"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1138,7 +1273,7 @@ nm_to_string(NeonMessage *msg) appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } @@ -1149,11 +1284,22 @@ nm_to_string(NeonMessage *msg) appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); - appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); appendStringInfoChar(&s, '}'); break; } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); + appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); + appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } /* pagestore -> pagestore_client */ case T_NeonExistsResponse: { @@ -1207,6 +1353,17 @@ nm_to_string(NeonMessage *msg) msg_resp->db_size); appendStringInfoChar(&s, '}'); + break; + } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; + + appendStringInfoString(&s, "{\"type\": 
\"NeonGetSlruSegmentResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + break; } @@ -1245,6 +1402,10 @@ PageIsEmptyHeapPage(char *buffer) return memcmp(buffer, empty_page.data, BLCKSZ) == 0; } +/* + * A page is being evicted from the shared buffer cache. Update the + * last-written LSN of the page, and WAL-log it if needed. + */ static void #if PG_MAJORVERSION_NUM < 16 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force) @@ -1253,12 +1414,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co #endif { XLogRecPtr lsn = PageGetLSN((Page) buffer); - - if (ShutdownRequestPending) - return; - /* Don't log any pages if we're not allowed to do so. */ - if (!XLogInsertAllowed()) - return; + bool log_page; /* * Whenever a VM or FSM page is evicted, WAL-log it. FSM and (some) VM @@ -1267,9 +1423,21 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co * correctness, the non-logged updates are not critical. But we want to * have a reasonably up-to-date VM and FSM in the page server. */ - if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress()) + log_page = false; + if (force) + { + Assert(XLogInsertAllowed()); + log_page = true; + } + else if (XLogInsertAllowed() && + !ShutdownRequestPending && + (forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM)) + { + log_page = true; + } + + if (log_page) { - /* FSM is never WAL-logged and we don't care. */ XLogRecPtr recptr; recptr = log_newpage_copy(&InfoFromSMgrRel(reln), forknum, blocknum, @@ -1277,12 +1445,13 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co XLogFlush(recptr); lsn = recptr; ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X", + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u was force logged. 
Evicted at lsn=%X/%X", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); } - else if (lsn == InvalidXLogRecPtr) + + if (lsn == InvalidXLogRecPtr) { /* * When PostgreSQL extends a relation, it calls smgrextend() with an @@ -1305,7 +1474,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co if (PageIsNew((Page) buffer)) { ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u is all-zeros", + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is all-zeros", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); @@ -1313,24 +1482,36 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co else if (PageIsEmptyHeapPage((Page) buffer)) { ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); } - else + else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM) { - ereport(PANIC, - (errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN", + /* + * Its a bad sign if there is a page with zero LSN in the buffer + * cache in a standby, too. However, PANICing seems like a cure + * worse than the disease, as the damage has likely already been + * done in the primary. So in a standby, make this an assertion, + * and in a release build just LOG the error and soldier on. We + * update the last-written LSN of the page with a conservative + * value in that case, which is the last replayed LSN. + */ + ereport(RecoveryInProgress() ? 
LOG : PANIC, + (errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum))); + Assert(false); + + lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */ } } else { ereport(SmgrTrace, - (errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X", + (errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X", blocknum, RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, LSN_FORMAT_ARGS(lsn)))); @@ -1411,44 +1592,123 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server */ -static XLogRecPtr -neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) +static neon_request_lsns +neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno) { - XLogRecPtr lsn; + XLogRecPtr last_written_lsn; + neon_request_lsns result; + + last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno); + last_written_lsn = nm_adjust_lsn(last_written_lsn); + Assert(last_written_lsn != InvalidXLogRecPtr); if (RecoveryInProgress()) { - /* - * We don't know if WAL has been generated but not yet replayed, so - * we're conservative in our estimates about latest pages. + /*--- + * In broad strokes, a replica always requests the page at the current + * replay LSN. But looking closer, what exactly is the replay LSN? Is + * it the last replayed record, or the record being replayed? And does + * the startup process performing the replay need to do something + * differently than backends running queries? Let's take a closer look + * at the different scenarios: + * + * 1. Startup process reads a page, last_written_lsn is old. + * + * Read the old version of the page. We will apply the WAL record on + * it to bring it up-to-date. 
+ * + * We could read the new version, with the changes from this WAL + * record already applied, to offload the work of replaying the record + * to the pageserver. The pageserver might not have received the WAL + * record yet, though, so a read of the old page version and applying + * the record ourselves is likely faster. Also, the redo function + * might be surprised if the changes have already applied. That's + * normal during crash recovery, but not in hot standby. + * + * 2. Startup process reads a page, last_written_lsn == record we're + * replaying. + * + * Can this happen? There are a few theoretical cases when it might: + * + * A) The redo function reads the same page twice. We had already read + * and applied the changes once, and now we're reading it for the + * second time. That would be a rather silly thing for a redo + * function to do, and I'm not aware of any that would do it. + * + * B) The redo function modifies multiple pages, and it already + * applied the changes to one of the pages, released the lock on + * it, and is now reading a second page. Furthermore, the first + * page was already evicted from the buffer cache, and also from + * the last-written LSN cache, so that the per-relation or global + * last-written LSN was already updated. All the WAL redo functions + * hold the locks on pages that they modify, until all the changes + * have been modified (?), which would make that impossible. + * However, we skip the locking, if the page isn't currently in the + * page cache (see neon_redo_read_buffer_filter below). + * + * Even if the one of the above cases were possible in theory, they + * would also require the pages being modified by the redo function to + * be immediately evicted from the page cache. + * + * So this probably does not happen in practice. But if it does, we + * request the new version, including the changes from the record + * being replayed. That seems like the correct behavior in any case. + * + * 3. 
Backend process reads a page with old last-written LSN + * + * Nothing special here. Read the old version. + * + * 4. Backend process reads a page with last_written_lsn == record being replayed + * + * This can happen, if the redo function has started to run, and saw + * that the page isn't present in the page cache (see + * neon_redo_read_buffer_filter below). Normally, in a normal + * Postgres server, the redo function would hold a lock on the page, + * so we would get blocked waiting the redo function to release the + * lock. To emulate that, wait for the WAL replay of the record to + * finish. */ - *latest = false; + /* Request the page at the end of the last fully replayed LSN. */ + XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL); - /* - * Get the last written LSN of this page. - */ - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - lsn = nm_adjust_lsn(lsn); + if (last_written_lsn > replay_lsn) + { + /* GetCurrentReplayRecPtr was introduced in v15 */ +#if PG_VERSION_NUM >= 150000 + Assert(last_written_lsn == GetCurrentReplayRecPtr(NULL)); +#endif - elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); + /* + * Cases 2 and 4. If this is a backend (case 4), the + * neon_read_at_lsn() call later will wait for the WAL record to be + * fully replayed. + */ + result.request_lsn = last_written_lsn; + } + else + { + /* cases 1 and 3 */ + result.request_lsn = replay_lsn; + } + result.not_modified_since = last_written_lsn; + result.effective_request_lsn = result.request_lsn; + Assert(last_written_lsn <= result.request_lsn); + + neon_log(DEBUG1, "neon_get_request_lsns request lsn %X/%X, not_modified_since %X/%X", + LSN_FORMAT_ARGS(result.request_lsn), LSN_FORMAT_ARGS(result.not_modified_since)); } else { XLogRecPtr flushlsn; /* - * Use the latest LSN that was evicted from the buffer cache. 
Any - * pages modified by later WAL records must still in the buffer cache, - * so our request cannot concern those. + * Use the latest LSN that was evicted from the buffer cache as the + * 'not_modified_since' hint. Any pages modified by later WAL records + * must still in the buffer cache, so our request cannot concern + * those. */ - *latest = true; - lsn = GetLastWrittenLSN(rinfo, forknum, blkno); - Assert(lsn != InvalidXLogRecPtr); - elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ", - (uint32) ((lsn) >> 32), (uint32) (lsn)); - - lsn = nm_adjust_lsn(lsn); + neon_log(DEBUG1, "neon_get_request_lsns GetLastWrittenLSN lsn %X/%X", + LSN_FORMAT_ARGS(last_written_lsn)); /* * Is it possible that the last-written LSN is ahead of last flush @@ -1463,16 +1723,144 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block #else flushlsn = GetFlushRecPtr(); #endif - if (lsn > flushlsn) + if (last_written_lsn > flushlsn) { - elog(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (flushlsn >> 32), (uint32) flushlsn); - XLogFlush(lsn); + neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X", + LSN_FORMAT_ARGS(last_written_lsn), + LSN_FORMAT_ARGS(flushlsn)); + XLogFlush(last_written_lsn); + flushlsn = last_written_lsn; } + + /* + * Request the very latest version of the page. In principle we + * want to read the page at the current insert LSN, and we could + * use that value in the request. However, there's a corner case + * with pageserver's garbage collection. If the GC horizon is + * set to a very small value, it's possible that by the time + * that the pageserver processes our request, the GC horizon has + * already moved past the LSN we calculate here. 
Standby servers + * always have that problem as the can always lag behind the + * primary, but for the primary we can avoid it by always + * requesting the latest page, by setting request LSN to + * UINT64_MAX. + * + * Remember the current LSN, however, so that we can later + * correctly determine if the response to the request is still + * valid. The most up-to-date LSN we could use for that purpose + * would be the current insert LSN, but to avoid the overhead of + * looking it up, use 'flushlsn' instead. This relies on the + * assumption that if the page was modified since the last WAL + * flush, it should still be in the buffer cache, and we + * wouldn't be requesting it. + */ + result.request_lsn = UINT64_MAX; + result.not_modified_since = last_written_lsn; + result.effective_request_lsn = flushlsn; } - return lsn; + return result; +} + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. + */ +static bool +neon_prefetch_response_usable(neon_request_lsns request_lsns, + PrefetchRequest *slot) +{ + /* sanity check the LSN's on the old and the new request */ + Assert(request_lsns.request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn >= request_lsns.not_modified_since); + Assert(request_lsns.effective_request_lsn <= request_lsns.request_lsn); + Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitary LSNs, which + * violates this. 
+ * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsns.effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns.not_modified_since < slot->request_lsns.not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns.not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); + return false; + } + + /*--- + * Each request to the pageserver has three LSN values associated with it: + * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. + * `not_modified_since` and `request_lsn` are sent to the pageserver, but + * in the primary node, we always use UINT64_MAX as the `request_lsn`, so + * we remember `effective_request_lsn` separately. In a primary, + * `effective_request_lsn` is the last flush WAL position when the request + * was sent to the pageserver. That's logically the LSN that we are + * requesting the page at, but we send UINT64_MAX to the pageserver so + * that if the GC horizon advances past that position, we still get a + * valid response instead of an error. + * + * To determine whether a response to a GetPage request issued earlier is + * still valid to satisfy a new page read, we look at the + * (not_modified_since, effective_request_lsn] range of the request. 
It is + * effectively a claim that the page has not been modified between those + * LSNs. If the range of the old request in the queue overlaps with the + * new request, we know that the page hasn't been modified in the union of + * the ranges. We can use the response to old request to satisfy the new + * request in that case. For example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the second claims that it was not modified between 400 and + * 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds at the boundary case that the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is valid also for the new request. 
+ */ + + /* this follows from the checks above */ + Assert(request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + + return request_lsns.not_modified_since <= slot->request_lsns.effective_request_lsn; } /* @@ -1484,8 +1872,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) bool exists; NeonResponse *resp; BlockNumber n_blocks; - bool latest; - XLogRecPtr request_lsn; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { @@ -1509,7 +1896,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return mdexists(reln, forkNum); default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks)) @@ -1540,14 +1927,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) return false; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO); { NeonExistsRequest request = { .req.tag = T_NeonExistsRequest, - .req.latest = latest, - .req.lsn = request_lsn, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), - .forknum = forkNum}; + .forknum = forkNum + }; resp = page_server_request(&request); } @@ -1561,16 +1949,18 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + errmsg(NEON_TAG "could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", 
((NeonErrorResponse *) resp)->message))); break; default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); } pfree(resp); return exists; @@ -1587,7 +1977,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) switch (reln->smgr_relpersistence) { case 0: - elog(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); + neon_log(ERROR, "cannot call smgrcreate() on rel with unknown persistence"); case RELPERSISTENCE_PERMANENT: break; @@ -1598,10 +1988,10 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo) return; default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - elog(SmgrTrace, "Create relation %u/%u/%u.%u", + neon_log(SmgrTrace, "Create relation %u/%u/%u.%u", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum); @@ -1696,7 +2086,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, switch (reln->smgr_relpersistence) { case 0: - elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); case RELPERSISTENCE_PERMANENT: break; @@ -1707,7 +2097,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, return; default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } /* @@ -1721,7 +2111,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= 
((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, @@ -1745,7 +2135,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1); lsn = PageGetLSN((Page) buffer); - elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, blkno, (uint32) (lsn >> 32), (uint32) lsn); @@ -1778,14 +2168,13 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, int nblocks, bool skipFsync) { const PGAlignedBlock buffer = {0}; - BlockNumber curblocknum = blocknum; int remblocks = nblocks; XLogRecPtr lsn = 0; switch (reln->smgr_relpersistence) { case 0: - elog(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); case RELPERSISTENCE_PERMANENT: break; @@ -1796,19 +2185,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, return; default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if (max_cluster_size > 0 && reln->smgr_relpersistence == RELPERSISTENCE_PERMANENT && !IsAutoVacuumWorkerProcess()) { - uint64 current_size = GetZenithCurrentClusterSize(); + uint64 current_size = GetNeonCurrentClusterSize(); if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024) ereport(ERROR, (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file because cluster size limit (%d MB) has been exceeded", + errmsg("could not extend file because project size limit (%d MB) has been exceeded", max_cluster_size), errhint("This limit is defined by neon.max_cluster_size GUC"))); } @@ -1821,7 +2210,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if ((uint64) blocknum + nblocks >= (uint64) 
InvalidBlockNumber) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("cannot extend file \"%s\" beyond %u blocks", + errmsg(NEON_TAG "cannot extend file \"%s\" beyond %u blocks", relpath(reln->smgr_rlocator, forkNum), InvalidBlockNumber))); @@ -1882,7 +2271,7 @@ neon_open(SMgrRelation reln) mdopen(reln); /* no work */ - elog(SmgrTrace, "[NEON_SMGR] open noop"); + neon_log(SmgrTrace, "open noop"); } /* @@ -1919,7 +2308,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) return mdprefetch(reln, forknum, blocknum); default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum)) @@ -1930,7 +2319,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); - ring_index = prefetch_register_buffer(tag, NULL, NULL); + ring_index = prefetch_register_buffer(tag, NULL); Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); @@ -1964,11 +2353,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, return; default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } /* not implemented */ - elog(SmgrTrace, "[NEON_SMGR] writeback noop"); + neon_log(SmgrTrace, "writeback noop"); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1983,10 +2372,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, void #if PG_MAJORVERSION_NUM < 16 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer) + neon_request_lsns request_lsns, char *buffer) #else neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, void *buffer) + 
neon_request_lsns request_lsns, void *buffer) #endif { NeonResponse *resp; @@ -2018,25 +2407,27 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, * value of the LwLsn cache when the entry is not found. */ if (RecoveryInProgress() && !(MyBackendType == B_STARTUP)) - XLogWaitForReplayOf(request_lsn); + XLogWaitForReplayOf(request_lsns.request_lsn); /* * Try to find prefetched page in the list of received pages. */ +Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) { slot = entry->slot; - if (slot->effective_request_lsn >= request_lsn) + if (neon_prefetch_response_usable(request_lsns, slot)) { ring_index = slot->my_ring_index; pgBufferUsage.prefetch.hits += 1; } - else /* the current prefetch LSN is not large - * enough, so drop the prefetch */ + else { /* + * Cannot use this prefetch, discard it + * * We can't drop cache for not-yet-received requested items. It is * unlikely this happens, but it can happen if prefetch distance * is large enough and a backend didn't consume all prefetch @@ -2044,7 +2435,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, */ if (slot->status == PRFS_REQUESTED) { - prefetch_wait_for(slot->my_ring_index); + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; } /* drop caches */ prefetch_set_unused(slot->my_ring_index); @@ -2060,8 +2452,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, { pgBufferUsage.prefetch.misses += 1; - ring_index = prefetch_register_buffer(buftag, &request_latest, - &request_lsn); + ring_index = prefetch_register_buffer(buftag, &request_lsns); slot = GetPrfSlot(ring_index); } else @@ -2098,16 +2489,18 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - blkno, + errmsg(NEON_TAG "[shard %d] could 
not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, blkno, RelFileInfoFmt(rinfo), forkNum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); } /* buffer was used, clean up for later reuse */ @@ -2125,13 +2518,12 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) #endif { - bool latest; - XLogRecPtr request_lsn; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { case 0: - elog(ERROR, "cannot call smgrread() on rel with unknown persistence"); + neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); case RELPERSISTENCE_PERMANENT: break; @@ -2142,7 +2534,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } /* Try to read from local file cache */ @@ -2151,8 +2543,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer return; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno); - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno); + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && 
IS_LOCAL_REL(reln)) @@ -2170,7 +2562,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer { if (!PageIsNew((Page) pageserver_masked)) { - elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, @@ -2180,7 +2572,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } else if (PageIsNew((Page) buffer)) { - elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", + neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, @@ -2195,7 +2587,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { - elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, @@ -2214,7 +2606,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) { - elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", + neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, @@ -2294,13 +2686,13 @@ 
neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo return; default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } neon_wallog_page(reln, forknum, blocknum, buffer, false); lsn = PageGetLSN((Page) buffer); - elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", + neon_log(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, blocknum, (uint32) (lsn >> 32), (uint32) lsn); @@ -2321,13 +2713,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) { NeonResponse *resp; BlockNumber n_blocks; - bool latest; - XLogRecPtr request_lsn; + neon_request_lsns request_lsns; switch (reln->smgr_relpersistence) { case 0: - elog(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); + neon_log(ERROR, "cannot call smgrnblocks() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: @@ -2338,23 +2729,23 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) return mdnblocks(reln, forknum); default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks)) { - elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", + neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, n_blocks); return n_blocks; } - request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); + request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO); { NeonNblocksRequest request = { .req.tag = T_NeonNblocksRequest, - .req.latest = latest, - .req.lsn = request_lsn, + .req.lsn = request_lsns.request_lsn, + .req.not_modified_since = 
request_lsns.not_modified_since, .rinfo = InfoFromSMgrRel(reln), .forknum = forknum, }; @@ -2371,24 +2762,26 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + errmsg(NEON_TAG "could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", RelFileInfoFmt(InfoFromSMgrRel(reln)), forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); } update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); - elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - n_blocks); + neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + n_blocks); pfree(resp); return n_blocks; @@ -2402,16 +2795,15 @@ neon_dbsize(Oid dbNode) { NeonResponse *resp; int64 db_size; - XLogRecPtr request_lsn; - bool latest; + neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; - request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); + request_lsns = neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO); { NeonDbSizeRequest request = { .req.tag = T_NeonDbSizeRequest, - .req.latest = latest, - .req.lsn = request_lsn, + .req.lsn = 
request_lsns.request_lsn, + .req.not_modified_since = request_lsns.not_modified_since, .dbNode = dbNode, }; @@ -2427,21 +2819,20 @@ neon_dbsize(Oid dbNode) case T_NeonErrorResponse: ereport(ERROR, (errcode(ERRCODE_IO_ERROR), - errmsg("could not read db size of db %u from page server at lsn %X/%08X", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn), + errmsg(NEON_TAG "could not read db size of db %u from page server at lsn %X/%08X", + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), errdetail("page server returned error: %s", ((NeonErrorResponse *) resp)->message))); break; default: - elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag); + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); } - elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - db_size); + neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); pfree(resp); return db_size; @@ -2458,7 +2849,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) switch (reln->smgr_relpersistence) { case 0: - elog(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); + neon_log(ERROR, "cannot call smgrtruncate() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: @@ -2470,7 +2861,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) return; default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks); @@ -2484,7 +2875,6 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * the most recently 
inserted WAL record's LSN. */ lsn = GetXLogInsertRecPtr(); - lsn = nm_adjust_lsn(lsn); /* @@ -2526,7 +2916,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) switch (reln->smgr_relpersistence) { case 0: - elog(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); + neon_log(ERROR, "cannot call smgrimmedsync() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: @@ -2538,10 +2928,10 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) return; default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } - elog(SmgrTrace, "[NEON_SMGR] immedsync noop"); + neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -2566,17 +2956,17 @@ neon_start_unlogged_build(SMgrRelation reln) * progress at a time. That's enough for the current usage. */ if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) - elog(ERROR, "unlogged relation build is already in progress"); + neon_log(ERROR, "unlogged relation build is already in progress"); Assert(unlogged_build_rel == NULL); ereport(SmgrTrace, - (errmsg("starting unlogged build of relation %u/%u/%u", + (errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u", RelFileInfoFmt(InfoFromSMgrRel(reln))))); switch (reln->smgr_relpersistence) { case 0: - elog(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); + neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence"); break; case RELPERSISTENCE_PERMANENT: @@ -2589,11 +2979,11 @@ neon_start_unlogged_build(SMgrRelation reln) return; default: - elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); } if (smgrnblocks(reln, MAIN_FORKNUM) != 0) - elog(ERROR, "cannot perform unlogged index build, index is not empty "); + neon_log(ERROR, 
"cannot perform unlogged index build, index is not empty "); unlogged_build_rel = reln; unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; @@ -2602,10 +2992,14 @@ neon_start_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; /* + * Create the local file. In a parallel build, the leader is expected to + * call this first and do it. + * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ - mdcreate(reln, MAIN_FORKNUM, false); + if (!IsParallelWorker()) + mdcreate(reln, MAIN_FORKNUM, false); } /* @@ -2620,7 +3014,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_rel == reln); ereport(SmgrTrace, - (errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u", + (errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u", RelFileInfoFmt(InfoFromSMgrRel(reln))))); if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) @@ -2629,7 +3023,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; + /* + * In a parallel build, (only) the leader process performs the 2nd + * phase. 
+ */ + if (IsParallelWorker()) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + } + else + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /* @@ -2649,7 +3053,7 @@ neon_end_unlogged_build(SMgrRelation reln) Assert(unlogged_build_rel == reln); ereport(SmgrTrace, - (errmsg("ending unlogged build of relation %u/%u/%u", + (errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u", RelFileInfoFmt(InfoFromNInfoB(rinfob))))); if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) @@ -2664,7 +3068,7 @@ neon_end_unlogged_build(SMgrRelation reln) rinfob = InfoBFromSMgrRel(reln); for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", + neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u", RelFileInfoFmt(InfoFromNInfoB(rinfob)), forknum); @@ -2679,6 +3083,99 @@ neon_end_unlogged_build(SMgrRelation reln) unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } +#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0) + +static int +neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) +{ + XLogRecPtr request_lsn, + not_modified_since; + + /* + * Compute a request LSN to use, similar to neon_get_request_lsns() but the + * logic is a bit simpler. + */ + if (RecoveryInProgress()) + { + request_lsn = GetXLogReplayRecPtr(NULL); + if (request_lsn == InvalidXLogRecPtr) + { + /* + * This happens in neon startup, we start up without replaying any + * records. + */ + request_lsn = GetRedoStartLsn(); + } + request_lsn = nm_adjust_lsn(request_lsn); + } + else + request_lsn = UINT64_MAX; + + /* + * GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU + * segment has not changed since the basebackup, because in order to + * modify it, we would have had to download it already. And once + * downloaded, we never evict SLRU segments from local disk. 
+ */ + not_modified_since = nm_adjust_lsn(GetRedoStartLsn()); + + SlruKind kind; + + if (STRPREFIX(path, "pg_xact")) + kind = SLRU_CLOG; + else if (STRPREFIX(path, "pg_multixact/members")) + kind = SLRU_MULTIXACT_MEMBERS; + else if (STRPREFIX(path, "pg_multixact/offsets")) + kind = SLRU_MULTIXACT_OFFSETS; + else + return -1; + + NeonResponse *resp; + NeonGetSlruSegmentRequest request = { + .req.tag = T_NeonGetSlruSegmentRequest, + .req.lsn = request_lsn, + .req.not_modified_since = not_modified_since, + + .kind = kind, + .segno = segno + }; + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + do + { + while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no)); + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + } while (resp == NULL); + + switch (resp->tag) + { + case T_NeonGetSlruSegmentResponse: + n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks; + memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ); + break; + + case T_NeonErrorResponse: + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X", + kind, + segno, + LSN_FORMAT_ARGS(request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", + T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); + + return n_blocks; +} + static void AtEOXact_neon(XactEvent event, void *arg) { @@ -2707,7 +3204,7 @@ AtEOXact_neon(XactEvent event, void *arg) unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - (errmsg("unlogged index build was not properly finished")))); + (errmsg(NEON_TAG "unlogged index build was not properly finished")))); } break; } @@ -2737,6 +3234,8 @@ static 
const struct f_smgr neon_smgr = .smgr_start_unlogged_build = neon_start_unlogged_build, .smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1, .smgr_end_unlogged_build = neon_end_unlogged_build, + + .smgr_read_slru_segment = neon_read_slru_segment, }; const f_smgr * @@ -2765,6 +3264,9 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, { BlockNumber relsize; + /* This is only used in WAL replay */ + Assert(RecoveryInProgress()); + /* Extend the relation if we know its size */ if (get_cached_relsize(rinfo, forknum, &relsize)) { @@ -2783,14 +3285,13 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. */ - NeonResponse *response; NeonNblocksResponse *nbresponse; NeonNblocksRequest request = { .req = (NeonRequest) { - .lsn = end_recptr, - .latest = false, .tag = T_NeonNblocksRequest, + .lsn = end_recptr, + .not_modified_since = end_recptr, }, .rinfo = rinfo, .forknum = forknum, @@ -2806,14 +3307,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, set_cached_relsize(rinfo, forknum, relsize); SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum); - elog(SmgrTrace, "Set length to %d", relsize); + neon_log(SmgrTrace, "Set length to %d", relsize); } } #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4) /* - * TODO: May be it is better to make correspondent fgunctio from freespace.c public? + * TODO: May be it is better to make correspondent function from freespace.c public? 
*/ static BlockNumber get_fsm_physical_block(BlockNumber heapblk) @@ -2886,7 +3387,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) BufferTag tag; uint32 hash; LWLock *partitionLock; - Buffer buffer; + int buf_id; bool no_redo_needed; if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id)) @@ -2894,19 +3395,11 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) #if PG_VERSION_NUM < 150000 if (!XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno)) - elog(PANIC, "failed to locate backup block with ID %d", block_id); + neon_log(PANIC, "failed to locate backup block with ID %d", block_id); #else XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno); #endif - /* - * Out of an abundance of caution, we always run redo on shared catalogs, - * regardless of whether the block is stored in shared buffers. See also - * this function's top comment. - */ - if (!OidIsValid(NInfoGetDbOid(rinfo))) - return false; - CopyNRelFileInfoToBufTag(tag, rinfo); tag.forkNum = forknum; tag.blockNum = blkno; @@ -2920,21 +3413,32 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) */ LWLockAcquire(partitionLock, LW_SHARED); - /* Try to find the relevant buffer */ - buffer = BufTableLookup(&tag, hash); + /* + * Out of an abundance of caution, we always run redo on shared catalogs, + * regardless of whether the block is stored in shared buffers. See also + * this function's top comment. 
+ */ + if (!OidIsValid(NInfoGetDbOid(rinfo))) + { + no_redo_needed = false; + } + else + { + /* Try to find the relevant buffer */ + buf_id = BufTableLookup(&tag, hash); - no_redo_needed = buffer < 0; - - /* In both cases st lwlsn past this WAL record */ - SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); + no_redo_needed = buf_id < 0; + } /* * we don't have the buffer in memory, update lwLsn past this record, also - * evict page fro file cache + * evict page from file cache */ if (no_redo_needed) + { + SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno); lfc_evict(rinfo, forknum, blkno); - + } LWLockRelease(partitionLock); diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c index b13134b5c3..cc7ac2c394 100644 --- a/pgxn/neon/relsize_cache.c +++ b/pgxn/neon/relsize_cache.c @@ -40,11 +40,23 @@ typedef struct { RelTag tag; BlockNumber size; + dlist_node lru_node; /* LRU list node */ } RelSizeEntry; +typedef struct +{ + size_t size; + uint64 hits; + uint64 misses; + uint64 writes; + dlist_head lru; /* double linked list for LRU replacement + * algorithm */ +} RelSizeHashControl; + static HTAB *relsize_hash; static LWLockId relsize_lock; static int relsize_hash_size; +static RelSizeHashControl* relsize_ctl; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; #if PG_VERSION_NUM >= 150000 static shmem_request_hook_type prev_shmem_request_hook = NULL; @@ -52,7 +64,7 @@ static void relsize_shmem_request(void); #endif /* - * Size of a cache entry is 20 bytes. So this default will take about 1.2 MB, + * Size of a cache entry is 36 bytes. So this default will take about 2.3 MB, * which seems reasonable. 
*/ #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024) @@ -61,19 +73,29 @@ static void neon_smgr_shmem_startup(void) { static HASHCTL info; + bool found; if (prev_shmem_startup_hook) prev_shmem_startup_hook(); LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); - relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); - info.keysize = sizeof(RelTag); - info.entrysize = sizeof(RelSizeEntry); - relsize_hash = ShmemInitHash("neon_relsize", - relsize_hash_size, relsize_hash_size, - &info, - HASH_ELEM | HASH_BLOBS); - LWLockRelease(AddinShmemInitLock); + relsize_ctl = (RelSizeHashControl *) ShmemInitStruct("relsize_hash", sizeof(RelSizeHashControl), &found); + if (!found) + { + relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize"); + info.keysize = sizeof(RelTag); + info.entrysize = sizeof(RelSizeEntry); + relsize_hash = ShmemInitHash("neon_relsize", + relsize_hash_size, relsize_hash_size, + &info, + HASH_ELEM | HASH_BLOBS); + LWLockRelease(AddinShmemInitLock); + relsize_ctl->size = 0; + relsize_ctl->hits = 0; + relsize_ctl->misses = 0; + relsize_ctl->writes = 0; + dlist_init(&relsize_ctl->lru); + } } bool @@ -93,7 +115,15 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size) if (entry != NULL) { *size = entry->size; + relsize_ctl->hits += 1; found = true; + /* Move entry to the LRU list tail */ + dlist_delete(&entry->lru_node); + dlist_push_tail(&relsize_ctl->lru, &entry->lru_node); + } + else + { + relsize_ctl->misses += 1; } LWLockRelease(relsize_lock); } @@ -107,12 +137,43 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) { RelTag tag; RelSizeEntry *entry; + bool found = false; tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); - entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL); + /* + * This should actually never happen! Below we check if hash is full and delete least recently user item in this case. + * But for further safety we also perform check here. 
+ */ + while ((entry = hash_search(relsize_hash, &tag, HASH_ENTER_NULL, &found)) == NULL) + { + RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru)); + hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL); + Assert(relsize_ctl->size > 0); + relsize_ctl->size -= 1; + } entry->size = size; + if (!found) + { + if (++relsize_ctl->size == relsize_hash_size) + { + /* + * Remove least recently used elment from the hash. + * Hash size after is becomes `relsize_hash_size-1`. + * But it is not considered to be a problem, because size of this hash is expecrted large enough and +-1 doesn't matter. + */ + RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru)); + hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL); + relsize_ctl->size -= 1; + } + } + else + { + dlist_delete(&entry->lru_node); + } + dlist_push_tail(&relsize_ctl->lru, &entry->lru_node); + relsize_ctl->writes += 1; LWLockRelease(relsize_lock); } } @@ -132,6 +193,21 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size) entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found); if (!found || entry->size < size) entry->size = size; + if (!found) + { + if (++relsize_ctl->size == relsize_hash_size) + { + RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru)); + hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL); + relsize_ctl->size -= 1; + } + } + else + { + dlist_delete(&entry->lru_node); + } + relsize_ctl->writes += 1; + dlist_push_tail(&relsize_ctl->lru, &entry->lru_node); LWLockRelease(relsize_lock); } } @@ -142,11 +218,16 @@ forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum) if (relsize_hash_size > 0) { RelTag tag; - + RelSizeEntry *entry; tag.rinfo = rinfo; tag.forknum = forknum; LWLockAcquire(relsize_lock, LW_EXCLUSIVE); - hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); + entry = 
hash_search(relsize_hash, &tag, HASH_REMOVE, NULL); + if (entry) + { + dlist_delete(&entry->lru_node); + relsize_ctl->size -= 1; + } LWLockRelease(relsize_lock); } } @@ -191,7 +272,7 @@ relsize_shmem_request(void) if (prev_shmem_request_hook) prev_shmem_request_hook(); - RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); + RequestAddinShmemSpace(sizeof(RelSizeHashControl) + hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry))); RequestNamedLWLockTranche("neon_relsize", 1); } #endif diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 1f7c473e7d..dbc67a24f5 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -70,7 +70,7 @@ static bool SendAppendRequests(Safekeeper *sk); static bool RecvAppendResponses(Safekeeper *sk); static XLogRecPtr CalculateMinFlushLsn(WalProposer *wp); static XLogRecPtr GetAcknowledgedByQuorumWALPosition(WalProposer *wp); -static void HandleSafekeeperResponse(WalProposer *wp); +static void HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk); static bool AsyncRead(Safekeeper *sk, char **buf, int *buf_size); static bool AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg); static bool BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState success_state); @@ -80,7 +80,7 @@ static int CompareLsn(const void *a, const void *b); static char *FormatSafekeeperState(Safekeeper *sk); static void AssertEventsOkForState(uint32 events, Safekeeper *sk); static char *FormatEvents(WalProposer *wp, uint32 events); - +static void UpdateDonorShmem(WalProposer *wp); WalProposer * WalProposerCreate(WalProposerConfig *config, walproposer_api api) @@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, 
sk->host, sk->port, sk->greetResponse.term); /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; @@ -922,6 +922,8 @@ static void DetermineEpochStartLsn(WalProposer *wp) { TermHistory *dth; + int n_ready = 0; + WalproposerShmemState *walprop_shared; wp->propEpochStartLsn = InvalidXLogRecPtr; wp->donorEpoch = 0; @@ -932,6 +934,8 @@ DetermineEpochStartLsn(WalProposer *wp) { if (wp->safekeeper[i].state == SS_IDLE) { + n_ready++; + if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch || (GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch && wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn)) @@ -958,9 +962,21 @@ DetermineEpochStartLsn(WalProposer *wp) } } + if (n_ready < wp->quorum) + { + /* + * This is a rare case that can be triggered if safekeeper has voted + * and disconnected. In this case, its state will not be SS_IDLE and + * its vote cannot be used, because we clean up `voteResponse` in + * `ShutdownConnection`. + */ + wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready); + } + /* - * If propEpochStartLsn is 0 everywhere, we are bootstrapping -- nothing - * was committed yet. Start streaming then from the basebackup LSN. + * If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are + * bootstrapping and nothing was committed yet. Start streaming then from + * the basebackup LSN. */ if (wp->propEpochStartLsn == InvalidXLogRecPtr && !wp->config->syncSafekeepers) { @@ -971,14 +987,16 @@ DetermineEpochStartLsn(WalProposer *wp) } wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } + pg_atomic_write_u64(&wp->api.get_shmem_state(wp)->propEpochStartLsn, wp->propEpochStartLsn); /* - * If propEpochStartLsn is not 0, at least one msg with WAL was sent to - * some connected safekeeper; it must have carried truncateLsn pointing to - * the first record. 
+ * Safekeepers are setting truncateLsn after timelineStartLsn is known, so + * it should never be zero at this point, if we know timelineStartLsn. + * + * timelineStartLsn can be zero only on the first syncSafekeepers run. */ Assert((wp->truncateLsn != InvalidXLogRecPtr) || - (wp->config->syncSafekeepers && wp->truncateLsn == wp->propEpochStartLsn)); + (wp->config->syncSafekeepers && wp->truncateLsn == wp->timelineStartLsn)); /* * We will be generating WAL since propEpochStartLsn, so we should set @@ -1008,10 +1026,9 @@ DetermineEpochStartLsn(WalProposer *wp) * since which we are going to write according to the consensus. If not, * we must bail out, as clog and other non rel data is inconsistent. */ + walprop_shared = wp->api.get_shmem_state(wp); if (!wp->config->syncSafekeepers) { - WalproposerShmemState *walprop_shared = wp->api.get_shmem_state(wp); - /* * Basebackup LSN always points to the beginning of the record (not * the page), as StartupXLOG most probably wants it this way. @@ -1026,7 +1043,7 @@ DetermineEpochStartLsn(WalProposer *wp) * compute (who could generate WAL) is ok. */ if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term == - walprop_shared->mineLastElectedTerm))) + pg_atomic_read_u64(&walprop_shared->mineLastElectedTerm)))) { /* * Panic to restart PG as we need to retake basebackup. 
@@ -1040,8 +1057,8 @@ DetermineEpochStartLsn(WalProposer *wp) LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } - walprop_shared->mineLastElectedTerm = wp->propTerm; } + pg_atomic_write_u64(&walprop_shared->mineLastElectedTerm, wp->propTerm); } /* @@ -1091,9 +1108,13 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , - sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); - /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u", + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + + /* + * wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline + * is created manually (test_s3_wal_replay) + */ Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else @@ -1163,6 +1184,12 @@ StartStreaming(Safekeeper *sk) sk->active_state = SS_ACTIVE_SEND; sk->streamingAt = sk->startStreamingAt; + /* + * Donors can only be in SS_ACTIVE state, so we potentially update the + * donor when we switch one to SS_ACTIVE. 
+ */ + UpdateDonorShmem(sk->wp); + /* event set will be updated inside SendMessageToNode */ SendMessageToNode(sk); } @@ -1206,7 +1233,7 @@ PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr begin req->epochStartLsn = wp->propEpochStartLsn; req->beginLsn = beginLsn; req->endLsn = endLsn; - req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp); + req->commitLsn = wp->commitLsn; req->truncateLsn = wp->truncateLsn; req->proposerId = wp->greetRequest.proposerId; } @@ -1391,7 +1418,6 @@ static bool RecvAppendResponses(Safekeeper *sk) { WalProposer *wp = sk->wp; - XLogRecPtr minQuorumLsn; bool readAnything = false; while (true) @@ -1411,6 +1437,8 @@ RecvAppendResponses(Safekeeper *sk) LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), sk->host, sk->port); + readAnything = true; + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1424,34 +1452,28 @@ RecvAppendResponses(Safekeeper *sk) sk->appendResponse.term, wp->propTerm); } - readAnything = true; + HandleSafekeeperResponse(wp, sk); } if (!readAnything) return sk->state == SS_ACTIVE; - HandleSafekeeperResponse(wp); - - /* - * Also send the new commit lsn to all the safekeepers. - */ - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); - if (minQuorumLsn > wp->lastSentCommitLsn) - { - BroadcastAppendRequest(wp); - wp->lastSentCommitLsn = minQuorumLsn; - } - return sk->state == SS_ACTIVE; } +#define psfeedback_log(fmt, key, ...) 
\ + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: %s " fmt, key, __VA_ARGS__) + /* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ -void -ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *rf) +static void +ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, PageserverFeedback *ps_feedback) { uint8 nkeys; int i; - int32 len; + + /* initialize the struct before parsing */ + memset(ps_feedback, 0, sizeof(PageserverFeedback)); + ps_feedback->present = true; /* get number of custom keys */ nkeys = pq_getmsgbyte(reply_message); @@ -1459,66 +1481,52 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese for (i = 0; i < nkeys; i++) { const char *key = pq_getmsgstring(reply_message); + unsigned int value_len = pq_getmsgint(reply_message, sizeof(int32)); if (strcmp(key, "current_timeline_size") == 0) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->currentClusterSize = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + Assert(value_len == sizeof(int64)); + ps_feedback->currentClusterSize = pq_getmsgint64(reply_message); + psfeedback_log(UINT64_FORMAT, key, ps_feedback->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->last_received_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->last_received_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { - 
pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->disk_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + Assert(value_len == sizeof(int64)); + ps_feedback->remote_consistent_lsn = pq_getmsgint64(reply_message); + psfeedback_log("%X/%X", key, LSN_FORMAT_ARGS(ps_feedback->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { - pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - rf->replytime = pq_getmsgint64(reply_message); - { - char *replyTimeStr; - - /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); - - pfree(replyTimeStr); - } + Assert(value_len == sizeof(int64)); + ps_feedback->replytime = pq_getmsgint64(reply_message); + psfeedback_log("%s", key, timestamptz_to_str(ps_feedback->replytime)); + } + else if (strcmp(key, "shard_number") == 0) + { + Assert(value_len == sizeof(uint32)); + ps_feedback->shard_number = pq_getmsgint(reply_message, sizeof(uint32)); + psfeedback_log("%u", key, ps_feedback->shard_number); } else { - len = pq_getmsgint(reply_message, sizeof(int32)); - /* read value length */ - /* * Skip unknown keys to 
support backward compatibile protocol * changes */ - wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); - pq_getmsgbytes(reply_message, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, value_len); + pq_getmsgbytes(reply_message, value_len); }; } } @@ -1573,17 +1581,17 @@ GetAcknowledgedByQuorumWALPosition(WalProposer *wp) * none if it doesn't exist. donor_lsn is set to end position of the donor to * the best of our knowledge. */ -Safekeeper * -GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) +static void +UpdateDonorShmem(WalProposer *wp) { - *donor_lsn = InvalidXLogRecPtr; Safekeeper *donor = NULL; int i; + XLogRecPtr donor_lsn = InvalidXLogRecPtr; if (wp->n_votes < wp->quorum) { - wp_log(WARNING, "GetDonor called before elections are won"); - return NULL; + wp_log(WARNING, "UpdateDonorShmem called before elections are won"); + return; } /* @@ -1594,7 +1602,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) if (wp->safekeeper[wp->donor].state >= SS_IDLE) { donor = &wp->safekeeper[wp->donor]; - *donor_lsn = wp->propEpochStartLsn; + donor_lsn = wp->propEpochStartLsn; } /* @@ -1606,23 +1614,45 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) { Safekeeper *sk = &wp->safekeeper[i]; - if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > *donor_lsn) + if (sk->state == SS_ACTIVE && sk->appendResponse.flushLsn > donor_lsn) { donor = sk; - *donor_lsn = sk->appendResponse.flushLsn; + donor_lsn = sk->appendResponse.flushLsn; } } - return donor; + + if (donor == NULL) + { + wp_log(WARNING, "UpdateDonorShmem didn't find a suitable donor, skipping"); + return; + } + wp->api.update_donor(wp, donor, donor_lsn); } +/* + * Process AppendResponse message from safekeeper. 
+ */ static void -HandleSafekeeperResponse(WalProposer *wp) +HandleSafekeeperResponse(WalProposer *wp, Safekeeper *sk) { - XLogRecPtr minQuorumLsn; XLogRecPtr candidateTruncateLsn; + XLogRecPtr newCommitLsn; - minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp); - wp->api.process_safekeeper_feedback(wp, minQuorumLsn); + newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp); + if (newCommitLsn > wp->commitLsn) + { + wp->commitLsn = newCommitLsn; + /* Send new value to all safekeepers. */ + BroadcastAppendRequest(wp); + } + + /* + * Unlock syncrep waiters, update ps_feedback, CheckGracefulShutdown(). + * The last one will terminate the process if the shutdown is requested + * and WAL is committed by the quorum. BroadcastAppendRequest() should be + * called to notify safekeepers about the new commitLsn. + */ + wp->api.process_safekeeper_feedback(wp, sk); /* * Try to advance truncateLsn -- the last record flushed to all @@ -1635,7 +1665,7 @@ HandleSafekeeperResponse(WalProposer *wp) * can't commit entries from previous term' in Raft); 2) */ candidateTruncateLsn = CalculateMinFlushLsn(wp); - candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn); + candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn); if (candidateTruncateLsn > wp->truncateLsn) { wp->truncateLsn = candidateTruncateLsn; @@ -1798,8 +1828,10 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) msg->hs.ts = pq_getmsgint64_le(&s); msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); - if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParsePageserverFeedbackMessage(wp, &s, &msg->rf); + if (s.len > s.cursor) + ParsePageserverFeedbackMessage(wp, &s, &msg->ps_feedback); + else + msg->ps_feedback.present = false; pq_getmsgend(&s); return true; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 688d8e6e52..41daeb87b9 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -10,6 +10,7 @@ #include 
"libpqwalproposer.h" #include "neon_walreader.h" +#include "pagestore_client.h" #define SK_MAGIC 0xCafeCeefu #define SK_PROTOCOL_VERSION 2 @@ -269,6 +270,8 @@ typedef struct HotStandbyFeedback typedef struct PageserverFeedback { + /* true if AppendResponse contains this feedback */ + bool present; /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ @@ -276,14 +279,27 @@ typedef struct PageserverFeedback XLogRecPtr disk_consistent_lsn; XLogRecPtr remote_consistent_lsn; TimestampTz replytime; + uint32 shard_number; } PageserverFeedback; typedef struct WalproposerShmemState { + pg_atomic_uint64 propEpochStartLsn; + char donor_name[64]; + char donor_conninfo[MAXCONNINFO]; + XLogRecPtr donor_lsn; + slock_t mutex; - PageserverFeedback feedback; - term_t mineLastElectedTerm; + pg_atomic_uint64 mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; + pg_atomic_uint64 currentClusterSize; + + /* last feedback from each shard */ + PageserverFeedback shard_ps_feedback[MAX_SHARDS]; + int num_shards; + + /* aggregated feedback with min LSNs across shards */ + PageserverFeedback min_ps_feedback; } WalproposerShmemState; /* @@ -307,12 +323,12 @@ typedef struct AppendResponse /* Feedback received from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ - PageserverFeedback rf; + PageserverFeedback ps_feedback; } AppendResponse; /* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ -#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) +#define APPENDRESPONSE_FIXEDPART_SIZE 56 struct WalProposer; typedef struct WalProposer WalProposer; @@ -454,6 +470,9 @@ typedef struct walproposer_api /* Get pointer to the latest available WAL. 
*/ XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp); + /* Update current donor info in WalProposer Shmem */ + void (*update_donor) (WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn); + /* Get current time. */ TimestampTz (*get_current_timestamp) (WalProposer *wp); @@ -486,6 +505,8 @@ typedef struct walproposer_api * * On success, the data is placed in *buf. It is valid until the next call * to this function. + * + * Returns PG_ASYNC_READ_FAIL on closed connection. */ PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount); @@ -532,6 +553,14 @@ typedef struct walproposer_api * Returns 0 if timeout is reached, 1 if some event happened. Updates * events mask to indicate events and sets sk to the safekeeper which has * an event. + * + * On timeout, events is set to WL_NO_EVENTS. On socket event, events is + * set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is + * closed, events is set to WL_SOCKET_READABLE. + * + * WL_SOCKET_WRITEABLE is usually set only when we need to flush the + * buffer. It can be returned only if caller asked for this event in the + * last *_event_set call. */ int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events); @@ -551,11 +580,11 @@ typedef struct walproposer_api void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn); /* - * Called after every new message from the safekeeper. Used to propagate - * backpressure feedback and to confirm WAL persistence (has been commited - * on the quorum of safekeepers). + * Called after every AppendResponse from the safekeeper. Used to + * propagate backpressure feedback and to confirm WAL persistence (has + * been commited on the quorum of safekeepers). */ - void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn); + void (*process_safekeeper_feedback) (WalProposer *wp, Safekeeper *sk); /* * Write a log message to the internal log processor. 
This is used only @@ -637,8 +666,8 @@ typedef struct WalProposer /* WAL has been generated up to this point */ XLogRecPtr availableLsn; - /* last commitLsn broadcasted to safekeepers */ - XLogRecPtr lastSentCommitLsn; + /* cached GetAcknowledgedByQuorumWALPosition result */ + XLogRecPtr commitLsn; ProposerGreeting greetRequest; @@ -696,12 +725,14 @@ extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPt extern void WalProposerPoll(WalProposer *wp); extern void WalProposerFree(WalProposer *wp); +extern WalproposerShmemState *GetWalpropShmemState(); + /* * WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to * recreate set from scratch, hence the export. */ extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events); -extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); +extern TimeLineID walprop_pg_get_timeline_id(void); #define WPEVENT 1337 /* special log level for walproposer internal diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 61a2a54809..da1a6f76f0 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,11 +63,16 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; -static AppendResponse quorumFeedback; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; static const walproposer_api walprop_pg; +static volatile sig_atomic_t got_SIGUSR2 = false; +static bool reported_sigusr2 = false; + +static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr; +static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr; +static HotStandbyFeedback agg_hs_feedback; static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); @@ -80,7 +85,6 @@ static void walprop_pg_init_standalone_sync_safekeepers(void); static void walprop_pg_init_walsender(void); static void 
walprop_pg_init_bgworker(void); static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp); -static TimeLineID walprop_pg_get_timeline_id(void); static void walprop_pg_load_libpqwalreceiver(void); static process_interrupts_callback_t PrevProcessInterruptsCallback; @@ -89,19 +93,18 @@ static shmem_startup_hook_type prev_shmem_startup_hook_type; static shmem_request_hook_type prev_shmem_request_hook = NULL; static void walproposer_shmem_request(void); #endif +static void WalproposerShmemInit_SyncSafekeeper(void); + static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd); static void WalSndLoop(WalProposer *wp); static void XLogBroadcastWalProposer(WalProposer *wp); -static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr); -static void XLogWalPropClose(XLogRecPtr recptr); - static void add_nwr_event_set(Safekeeper *sk, uint32 events); static void update_nwr_event_set(Safekeeper *sk, uint32 events); static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk); -static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp); +static void CheckGracefulShutdown(WalProposer *wp); static void init_walprop_config(bool syncSafekeepers) @@ -129,6 +132,7 @@ WalProposerSync(int argc, char *argv[]) WalProposer *wp; init_walprop_config(true); + WalproposerShmemInit_SyncSafekeeper(); walprop_pg_init_standalone_sync_safekeepers(); walprop_pg_load_libpqwalreceiver(); @@ -274,13 +278,27 @@ WalproposerShmemInit(void) { memset(walprop_shared, 0, WalproposerShmemSize()); SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); + pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0); } LWLockRelease(AddinShmemInitLock); return found; } +static void +WalproposerShmemInit_SyncSafekeeper(void) +{ + walprop_shared = palloc(WalproposerShmemSize()); + 
memset(walprop_shared, 0, WalproposerShmemSize()); + SpinLockInit(&walprop_shared->mutex); + pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0); + pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0); + pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0); +} + #define BACK_PRESSURE_DELAY 10000L // 0.01 sec static bool @@ -391,6 +409,13 @@ nwp_shmem_startup_hook(void) WalproposerShmemInit(); } +WalproposerShmemState * +GetWalpropShmemState() +{ + Assert(walprop_shared != NULL); + return walprop_shared; +} + static WalproposerShmemState * walprop_pg_get_shmem_state(WalProposer *wp) { @@ -398,21 +423,59 @@ walprop_pg_get_shmem_state(WalProposer *wp) return walprop_shared; } -void -replication_feedback_set(PageserverFeedback *rf) +/* + * Record new ps_feedback in the array with shards and update min_feedback. + */ +static PageserverFeedback +record_pageserver_feedback(PageserverFeedback *ps_feedback) { + PageserverFeedback min_feedback; + + Assert(ps_feedback->present); + Assert(ps_feedback->shard_number < MAX_SHARDS); + SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); + + /* Update the number of shards */ + if (ps_feedback->shard_number + 1 > walprop_shared->num_shards) + walprop_shared->num_shards = ps_feedback->shard_number + 1; + + /* Update the feedback */ + memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback)); + + /* Calculate min LSNs */ + memcpy(&min_feedback, ps_feedback, sizeof(PageserverFeedback)); + for (int i = 0; i < walprop_shared->num_shards; i++) + { + PageserverFeedback *feedback = &walprop_shared->shard_ps_feedback[i]; + + if (feedback->present) + { + if (min_feedback.last_received_lsn == InvalidXLogRecPtr || feedback->last_received_lsn < min_feedback.last_received_lsn) + min_feedback.last_received_lsn = feedback->last_received_lsn; + + if (min_feedback.disk_consistent_lsn == InvalidXLogRecPtr || 
feedback->disk_consistent_lsn < min_feedback.disk_consistent_lsn) + min_feedback.disk_consistent_lsn = feedback->disk_consistent_lsn; + + if (min_feedback.remote_consistent_lsn == InvalidXLogRecPtr || feedback->remote_consistent_lsn < min_feedback.remote_consistent_lsn) + min_feedback.remote_consistent_lsn = feedback->remote_consistent_lsn; + } + } + /* Copy min_feedback back to shmem */ + memcpy(&walprop_shared->min_ps_feedback, &min_feedback, sizeof(PageserverFeedback)); + SpinLockRelease(&walprop_shared->mutex); + + return min_feedback; } void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.last_received_lsn; - *flushLsn = walprop_shared->feedback.disk_consistent_lsn; - *applyLsn = walprop_shared->feedback.remote_consistent_lsn; + *writeLsn = walprop_shared->min_ps_feedback.last_received_lsn; + *flushLsn = walprop_shared->min_ps_feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->min_ps_feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } @@ -492,6 +555,26 @@ walprop_pg_init_standalone_sync_safekeepers(void) BackgroundWorkerUnblockSignals(); } +/* + * We pretend to be a walsender process, and the lifecycle of a walsender is + * slightly different than other procesess. At shutdown, walsender processes + * stay alive until the very end, after the checkpointer has written the + * shutdown checkpoint. When the checkpointer exits, the postmaster sends all + * remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send + * the remaining WAL, and then exit. This ensures that the checkpoint record + * reaches durable storage (in safekeepers), before the server shuts down + * completely. 
+ */ +static void +walprop_sigusr2(SIGNAL_ARGS) +{ + int save_errno = errno; + + got_SIGUSR2 = true; + SetLatch(MyLatch); + errno = save_errno; +} + static void walprop_pg_init_bgworker(void) { @@ -503,6 +586,7 @@ walprop_pg_init_bgworker(void) pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGHUP, SignalHandlerForConfigReload); pqsignal(SIGTERM, die); + pqsignal(SIGUSR2, walprop_sigusr2); BackgroundWorkerUnblockSignals(); @@ -533,7 +617,7 @@ walprop_pg_get_current_timestamp(WalProposer *wp) return GetCurrentTimestamp(); } -static TimeLineID +TimeLineID walprop_pg_get_timeline_id(void) { #if PG_VERSION_NUM >= 150000 @@ -552,6 +636,20 @@ walprop_pg_load_libpqwalreceiver(void) wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } +static void +walprop_pg_update_donor(WalProposer *wp, Safekeeper *donor, XLogRecPtr donor_lsn) +{ + WalproposerShmemState *wps = wp->api.get_shmem_state(wp); + char donor_name[64]; + + pg_snprintf(donor_name, sizeof(donor_name), "%s:%s", donor->host, donor->port); + SpinLockAcquire(&wps->mutex); + memcpy(wps->donor_name, donor_name, sizeof(donor_name)); + memcpy(wps->donor_conninfo, donor->conninfo, sizeof(donor->conninfo)); + wps->donor_lsn = donor_lsn; + SpinLockRelease(&wps->mutex); +} + /* Helper function */ static bool ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking) @@ -652,7 +750,6 @@ walprop_connect_start(Safekeeper *sk) { Assert(sk->conn == NULL); sk->conn = libpqwp_connect_start(sk->conninfo); - } static WalProposerConnectPollStatusType @@ -1026,7 +1123,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) { XLogRecPtr FlushPtr; - TimeLineID currTLI; + __attribute__((unused)) TimeLineID currTLI; #if PG_VERSION_NUM < 150000 if (ThisTimeLineID == 0) @@ -1075,14 +1172,26 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd) #endif /* - * When we first start replication the standby will be behind the primary. 
- * For some applications, for example synchronous replication, it is - * important to have a clear state for this initial catchup mode, so we - * can trigger actions when we change streaming state later. We may stay - * in this state for a long time, which is exactly why we want to be able - * to monitor whether or not we are still here. + * XXX: Move straight to STOPPING state, skipping the STREAMING state. + * + * This is a bit weird. Normal walsenders stay in STREAMING state, until + * the checkpointer signals them that it is about to start writing the + * shutdown checkpoint. The walsenders acknowledge that they have received + * that signal by switching to STOPPING state. That tells the walsenders + * that they must not write any new WAL. + * + * However, we cannot easily intercept that signal from the checkpointer. + * It's sent by WalSndInitStopping(), using + * SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by + * HandleWalSndInitStopping, which sets a process-local got_STOPPING flag. + * However, that's all private to walsender.c. + * + * We don't need to do anything special upon receiving the signal, the + * walproposer doesn't write any WAL anyway, so we skip the STREAMING + * state and go directly to STOPPING mode. That way, the checkpointer + * won't wait for us. */ - WalSndSetState(WALSNDSTATE_CATCHUP); + WalSndSetState(WALSNDSTATE_STOPPING); /* * Don't allow a request to stream from a future point in WAL that hasn't @@ -1130,9 +1239,6 @@ WalSndLoop(WalProposer *wp) CHECK_FOR_INTERRUPTS(); XLogBroadcastWalProposer(wp); - - if (MyWalSnd->state == WALSNDSTATE_CATCHUP) - WalSndSetState(WALSNDSTATE_STREAMING); WalProposerPoll(wp); } } @@ -1219,250 +1325,17 @@ XLogBroadcastWalProposer(WalProposer *wp) } } -/* Download WAL before basebackup for logical walsenders from sk, if needed */ +/* + Used to download WAL before basebackup for walproposer/logical walsenders. 
No + longer used, replaced by neon_walreader; but callback still exists because + simulation tests use it. + */ static bool WalProposerRecovery(WalProposer *wp, Safekeeper *sk) { - char *err; - WalReceiverConn *wrconn; - WalRcvStreamOptions options; - char conninfo[MAXCONNINFO]; - TimeLineID timeline; - XLogRecPtr startpos; - XLogRecPtr endpos; - uint64 download_range_mb; - - startpos = GetLogRepRestartLSN(wp); - if (startpos == InvalidXLogRecPtr) - return true; /* recovery not needed */ - endpos = wp->propEpochStartLsn; - - timeline = wp->greetRequest.timeline; - - if (!neon_auth_token) - { - memcpy(conninfo, sk->conninfo, MAXCONNINFO); - } - else - { - int written = 0; - - written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); - if (written > MAXCONNINFO || written < 0) - wpg_log(FATAL, "could not append password to the safekeeper connection string"); - } - -#if PG_MAJORVERSION_NUM < 16 - wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err); -#else - wrconn = walrcv_connect(conninfo, false, false, "wal_proposer_recovery", &err); -#endif - - if (!wrconn) - { - ereport(WARNING, - (errmsg("could not connect to WAL acceptor %s:%s: %s", - sk->host, sk->port, - err))); - return false; - } - wpg_log(LOG, - "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); - - options.logical = false; - options.startpoint = startpos; - options.slotname = NULL; - options.proto.physical.startpointTLI = timeline; - - if (walrcv_startstreaming(wrconn, &options)) - { - XLogRecPtr rec_start_lsn; - XLogRecPtr rec_end_lsn = 0; - int len; - char *buf; - pgsocket wait_fd = PGINVALID_SOCKET; - - while ((len = walrcv_receive(wrconn, &buf, &wait_fd)) >= 0) - { - if (len == 0) - { - (void) WaitLatchOrSocket( - MyLatch, WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE, 
wait_fd, - -1, WAIT_EVENT_WAL_RECEIVER_MAIN); - } - else - { - Assert(buf[0] == 'w' || buf[0] == 'k'); - if (buf[0] == 'k') - continue; /* keepalive */ - memcpy(&rec_start_lsn, &buf[XLOG_HDR_START_POS], - sizeof rec_start_lsn); - rec_start_lsn = pg_ntoh64(rec_start_lsn); - rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE; - - /* write WAL to disk */ - XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn); - - ereport(DEBUG1, - (errmsg("Recover message %X/%X length %d", - LSN_FORMAT_ARGS(rec_start_lsn), len))); - if (rec_end_lsn >= endpos) - break; - } - } - ereport(LOG, - (errmsg("end of replication stream at %X/%X: %m", - LSN_FORMAT_ARGS(rec_end_lsn)))); - walrcv_disconnect(wrconn); - - /* failed to receive all WAL till endpos */ - if (rec_end_lsn < endpos) - return false; - } - else - { - ereport(LOG, - (errmsg("primary server contains no more WAL on requested timeline %u LSN %X/%08X", - timeline, (uint32) (startpos >> 32), (uint32) startpos))); - return false; - } - return true; } -/* - * These variables are used similarly to openLogFile/SegNo, - * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID - * corresponding the filename of walpropFile. - */ -static int walpropFile = -1; -static TimeLineID walpropFileTLI = 0; -static XLogSegNo walpropSegNo = 0; - -/* - * Write XLOG data to disk. - */ -static void -XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr) -{ - int startoff; - int byteswritten; - - /* - * Apart from walproposer, basebackup LSN page is also written out by - * postgres itself which writes WAL only in pages, and in basebackup it is - * inherently dummy (only safekeepers have historic WAL). Update WAL - * buffers here to avoid dummy page overwriting correct one we download - * here. Ugly, but alternatives are about the same ugly. We won't need - * that if we switch to on-demand WAL download from safekeepers, without - * writing to disk. 
- * - * https://github.com/neondatabase/neon/issues/5749 - */ - if (!wp->config->syncSafekeepers) - XLogUpdateWalBuffers(buf, recptr, nbytes); - - while (nbytes > 0) - { - int segbytes; - - /* Close the current segment if it's completed */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - XLogWalPropClose(recptr); - - if (walpropFile < 0) - { -#if PG_VERSION_NUM >= 150000 - /* FIXME Is it ok to use hardcoded value here? */ - TimeLineID tli = 1; -#else - bool use_existent = true; -#endif - /* Create/use new log file */ - XLByteToSeg(recptr, walpropSegNo, wal_segment_size); -#if PG_VERSION_NUM >= 150000 - walpropFile = XLogFileInit(walpropSegNo, tli); - walpropFileTLI = tli; -#else - walpropFile = XLogFileInit(walpropSegNo, &use_existent, false); - walpropFileTLI = ThisTimeLineID; -#endif - } - - /* Calculate the start offset of the received logs */ - startoff = XLogSegmentOffset(recptr, wal_segment_size); - - if (startoff + nbytes > wal_segment_size) - segbytes = wal_segment_size - startoff; - else - segbytes = nbytes; - - /* OK to write the logs */ - errno = 0; - - byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff); - if (byteswritten <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - /* if write didn't set errno, assume no disk space */ - if (errno == 0) - errno = ENOSPC; - - save_errno = errno; - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log segment %s " - "at offset %u, length %lu: %m", - xlogfname, startoff, (unsigned long) segbytes))); - } - - /* Update state for write */ - recptr += byteswritten; - - nbytes -= byteswritten; - buf += byteswritten; - } - - /* - * Close the current segment if it's fully written up in the last cycle of - * the loop. 
- */ - if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)) - { - XLogWalPropClose(recptr); - } -} - -/* - * Close the current segment. - */ -static void -XLogWalPropClose(XLogRecPtr recptr) -{ - Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size)); - - if (close(walpropFile) != 0) - { - char xlogfname[MAXFNAMELEN]; - - XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size); - - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not close log segment %s: %m", - xlogfname))); - } - - walpropFile = -1; -} - static void walprop_pg_wal_reader_allocate(Safekeeper *sk) { @@ -1470,7 +1343,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); - sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); + sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, log_prefix); if (sk->xlogreader == NULL) wpg_log(FATAL, "failed to allocate xlog reader"); } @@ -1745,6 +1618,9 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 { ConditionVariableCancelSleep(); ResetLatch(MyLatch); + + CheckGracefulShutdown(wp); + *events = WL_LATCH_SET; return 1; } @@ -1799,36 +1675,38 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn) } /* - * Choose most advanced PageserverFeedback and set it to *rf. + * Like vanilla walsender, on sigusr2 send all remaining WAL and exit. + * + * Note that unlike sync-safekeepers waiting here is not reliable: we + * don't check that majority of safekeepers received and persisted + * commit_lsn -- only that walproposer reached it (which immediately + * broadcasts new value). Doing that without incurring redundant control + * file syncing would need wp -> sk protocol change. 
OTOH unlike + * sync-safekeepers which must bump commit_lsn or basebackup will fail, + * this catchup is important only for tests where safekeepers/network + * don't crash on their own. */ static void -GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) +CheckGracefulShutdown(WalProposer *wp) { - int latest_safekeeper = 0; - XLogRecPtr last_received_lsn = InvalidXLogRecPtr; - - for (int i = 0; i < wp->n_safekeepers; i++) + if (got_SIGUSR2) { - if (wp->safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) + if (!reported_sigusr2) { - latest_safekeeper = i; - last_received_lsn = wp->safekeeper[i].appendResponse.rf.last_received_lsn; + XLogRecPtr flushPtr = walprop_pg_get_flush_rec_ptr(wp); + + wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X", + LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(flushPtr)); + reported_sigusr2 = true; + } + + if (wp->commitLsn >= walprop_pg_get_flush_rec_ptr(wp)) + { + wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting", + LSN_FORMAT_ARGS(wp->commitLsn)); + proc_exit(0); } } - - rf->currentClusterSize = wp->safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->last_received_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; - rf->disk_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; - rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; - rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - - wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); } /* @@ -1838,34 +1716,30 @@ static void 
CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) { hs->ts = 0; - hs->xmin.value = ~0; /* largest unsigned value */ - hs->catalog_xmin.value = ~0; /* largest unsigned value */ + hs->xmin = InvalidFullTransactionId; + hs->catalog_xmin = InvalidFullTransactionId; for (int i = 0; i < wp->n_safekeepers; i++) { - if (wp->safekeeper[i].appendResponse.hs.ts != 0) + + if (wp->safekeeper[i].state == SS_ACTIVE) { HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs; if (FullTransactionIdIsNormal(skhs->xmin) - && FullTransactionIdPrecedes(skhs->xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin))) { hs->xmin = skhs->xmin; hs->ts = skhs->ts; } if (FullTransactionIdIsNormal(skhs->catalog_xmin) - && FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin)) + && (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin))) { hs->catalog_xmin = skhs->catalog_xmin; hs->ts = skhs->ts; } } } - - if (hs->xmin.value == ~0) - hs->xmin = InvalidFullTransactionId; - if (hs->catalog_xmin.value == ~0) - hs->catalog_xmin = InvalidFullTransactionId; } /* @@ -1878,26 +1752,38 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp) * None of that is functional in sync-safekeepers. 
*/ static void -walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) +walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk) { HotStandbyFeedback hsFeedback; - XLogRecPtr oldDiskConsistentLsn; + bool needToAdvanceSlot = false; if (wp->config->syncSafekeepers) return; - oldDiskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; - - /* Get PageserverFeedback fields from the most advanced safekeeper */ - GetLatestNeonFeedback(&quorumFeedback.rf, wp); - replication_feedback_set(&quorumFeedback.rf); - SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); - - if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) + /* handle fresh ps_feedback */ + if (sk->appendResponse.ps_feedback.present) { - if (commitLsn > quorumFeedback.flushLsn) - quorumFeedback.flushLsn = commitLsn; + PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback); + /* Only one main shard sends non-zero currentClusterSize */ + if (sk->appendResponse.ps_feedback.currentClusterSize > 0) + SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize); + + if (min_feedback.disk_consistent_lsn != standby_apply_lsn) + { + standby_apply_lsn = min_feedback.disk_consistent_lsn; + needToAdvanceSlot = true; + } + } + + if (wp->commitLsn > standby_flush_lsn) + { + standby_flush_lsn = wp->commitLsn; + needToAdvanceSlot = true; + } + + if (needToAdvanceSlot) + { /* * Advance the replication slot to commitLsn. WAL before it is * hardened and will be fetched from one of safekeepers by @@ -1906,29 +1792,45 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn) * Also wakes up syncrep waiters. */ ProcessStandbyReply( - /* write_lsn - This is what durably stored in WAL service. */ - quorumFeedback.flushLsn, - /* flush_lsn - This is what durably stored in WAL service. 
*/ - quorumFeedback.flushLsn, + /* write_lsn - This is what durably stored in safekeepers quorum. */ + standby_flush_lsn, + /* flush_lsn - This is what durably stored in safekeepers quorum. */ + standby_flush_lsn, /* * apply_lsn - This is what processed and durably saved at* * pageserver. */ - quorumFeedback.rf.disk_consistent_lsn, + standby_apply_lsn, walprop_pg_get_current_timestamp(wp), false); } CombineHotStanbyFeedbacks(&hsFeedback, wp); - if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &quorumFeedback.hs, sizeof hsFeedback) != 0) + if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0) { - quorumFeedback.hs = hsFeedback; + FullTransactionId xmin = hsFeedback.xmin; + FullTransactionId catalog_xmin = hsFeedback.catalog_xmin; + FullTransactionId next_xid = ReadNextFullTransactionId(); + /* + * Page server is updating nextXid in checkpoint each 1024 transactions, + * so feedback xmin can be actually larger then nextXid and + * function TransactionIdInRecentPast return false in this case, + * preventing update of slot's xmin. 
+ */ + if (FullTransactionIdPrecedes(next_xid, xmin)) + xmin = next_xid; + if (FullTransactionIdPrecedes(next_xid, catalog_xmin)) + catalog_xmin = next_xid; + agg_hs_feedback = hsFeedback; + elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin)); ProcessStandbyHSFeedback(hsFeedback.ts, - XidFromFullTransactionId(hsFeedback.xmin), - EpochFromFullTransactionId(hsFeedback.xmin), - XidFromFullTransactionId(hsFeedback.catalog_xmin), - EpochFromFullTransactionId(hsFeedback.catalog_xmin)); + XidFromFullTransactionId(xmin), + EpochFromFullTransactionId(xmin), + XidFromFullTransactionId(catalog_xmin), + EpochFromFullTransactionId(catalog_xmin)); } + + CheckGracefulShutdown(wp); } static XLogRecPtr @@ -1949,62 +1851,25 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line) elog(FATAL, "unexpected log_internal message at level %d: %s", level, line); } -static XLogRecPtr -GetLogRepRestartLSN(WalProposer *wp) +void +SetNeonCurrentClusterSize(uint64 size) { - FILE *f; - XLogRecPtr lrRestartLsn = InvalidXLogRecPtr; - - /* We don't need to do anything in syncSafekeepers mode. */ - if (wp->config->syncSafekeepers) - return InvalidXLogRecPtr; - - /* - * If there are active logical replication subscription we need to provide - * enough WAL for their WAL senders based on th position of their - * replication slots. - */ - f = fopen("restart.lsn", "rb"); - if (f != NULL) - { - size_t rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f); - - fclose(f); - if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr) - { - uint64 download_range_mb; - - wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); - - /* - * If we need to download more than a max_slot_wal_keep_size, - * don't do it to avoid risk of exploding pg_wal. 
Logical - * replication won't work until recreated, but at least compute - * would start; this also follows max_slot_wal_keep_size - * semantics. - */ - download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; - if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) - { - wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", - LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); - return InvalidXLogRecPtr; - } - - /* - * start from the beginning of the segment to fetch page headers - * verifed by XLogReader - */ - lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size); - } - } - return lrRestartLsn; + pg_atomic_write_u64(&walprop_shared->currentClusterSize, size); } +uint64 +GetNeonCurrentClusterSize(void) +{ + return pg_atomic_read_u64(&walprop_shared->currentClusterSize); +} +uint64 GetNeonCurrentClusterSize(void); + + static const walproposer_api walprop_pg = { .get_shmem_state = walprop_pg_get_shmem_state, .start_streaming = walprop_pg_start_streaming, .get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr, + .update_donor = walprop_pg_update_donor, .get_current_timestamp = walprop_pg_get_current_timestamp, .conn_error_message = walprop_error_message, .conn_status = walprop_status, diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c new file mode 100644 index 0000000000..8f8d1dfc01 --- /dev/null +++ b/pgxn/neon/walsender_hooks.c @@ -0,0 +1,197 @@ +/*------------------------------------------------------------------------- + * + * walsender_hooks.c + * + * Implements XLogReaderRoutine in terms of NeonWALReader. Allows for + * fetching WAL from safekeepers, which normal xlogreader can't do. 
+ * + *------------------------------------------------------------------------- + */ +#include "walsender_hooks.h" +#include "postgres.h" +#include "fmgr.h" +#include "access/xlogdefs.h" +#include "replication/walsender.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "miscadmin.h" +#include "utils/wait_event.h" +#include "utils/guc.h" +#include "postmaster/interrupt.h" + +#include "neon_walreader.h" +#include "walproposer.h" + +static NeonWALReader *wal_reader = NULL; + +struct WalSnd; +extern struct WalSnd *MyWalSnd; +extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); +extern bool GetDonorShmem(XLogRecPtr *donor_lsn); +extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); + +static XLogRecPtr +NeonWALReadWaitForWAL(XLogRecPtr loc) +{ + while (!NeonWALReaderUpdateDonor(wal_reader)) + { + pg_usleep(1000); + CHECK_FOR_INTERRUPTS(); + } + + // Walsender sends keepalives and stuff, so better use its normal wait + if (MyWalSnd != NULL) + return WalSndWaitForWal(loc); + + for (;;) + { + XLogRecPtr flush_ptr; + if (!RecoveryInProgress()) +#if PG_VERSION_NUM >= 150000 + flush_ptr = GetFlushRecPtr(NULL); +#else + flush_ptr = GetFlushRecPtr(); +#endif + else + flush_ptr = GetXLogReplayRecPtr(NULL); + + if (loc <= flush_ptr) + return flush_ptr; + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000); + } +} + +static int +NeonWALPageRead( + XLogReaderState *xlogreader, + XLogRecPtr targetPagePtr, + int reqLen, + XLogRecPtr targetRecPtr, + char *readBuf) +{ + XLogRecPtr rem_lsn; + + /* Wait for flush pointer to advance past our request */ + XLogRecPtr flushptr = NeonWALReadWaitForWAL(targetPagePtr + reqLen); + int count; + + if (flushptr < targetPagePtr + reqLen) + return -1; + + /* Read at most XLOG_BLCKSZ bytes */ + if (targetPagePtr + XLOG_BLCKSZ <= flushptr) + count = XLOG_BLCKSZ; + else + count = flushptr - targetPagePtr; + + /* + * Sometimes walsender requests non-monotonic sequences of WAL. 
If that's + * the case, we have to reset streaming from remote at the correct + * position. For example, walsender may try to verify the segment header + * when trying to read in the middle of it. + */ + rem_lsn = NeonWALReaderGetRemLsn(wal_reader); + if (rem_lsn != InvalidXLogRecPtr && targetPagePtr != rem_lsn) + { + NeonWALReaderResetRemote(wal_reader); + } + + for (;;) + { + NeonWALReadResult res = NeonWALRead( + wal_reader, + readBuf, + targetPagePtr, + count, + walprop_pg_get_timeline_id()); + + if (res == NEON_WALREAD_SUCCESS) + { + /* + * Setting ws_tli is required by the XLogReaderRoutine, it is used + * for segment name generation in error reports. + * + * ReadPageInternal updates ws_segno after calling cb on its own + * and XLogReaderRoutine description doesn't require it, but + * WALRead sets, let's follow it. + */ + xlogreader->seg.ws_tli = NeonWALReaderGetSegment(wal_reader)->ws_tli; + xlogreader->seg.ws_segno = NeonWALReaderGetSegment(wal_reader)->ws_segno; + + /* + * ws_file doesn't exist in case of remote read, and isn't used by + * xlogreader except by WALRead on which we don't rely anyway. 
+ */ + return count; + } + if (res == NEON_WALREAD_ERROR) + { + elog(ERROR, "[walsender] Failed to read WAL (req_lsn=%X/%X, len=%d): %s", + LSN_FORMAT_ARGS(targetPagePtr), + reqLen, + NeonWALReaderErrMsg(wal_reader)); + return -1; + } + + /* + * Res is WOULDBLOCK, so we wait on the socket, recreating event set + * if necessary + */ + { + + pgsocket sock = NeonWALReaderSocket(wal_reader); + uint32_t reader_events = NeonWALReaderEvents(wal_reader); + long timeout_ms = 1000; + + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + WaitLatchOrSocket( + MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events, + sock, + timeout_ms, + WAIT_EVENT_WAL_SENDER_MAIN); + } + } +} + +static void +NeonWALReadSegmentOpen(XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p) +{ + neon_wal_segment_open(wal_reader, nextSegNo, tli_p); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +static void +NeonWALReadSegmentClose(XLogReaderState *xlogreader) +{ + neon_wal_segment_close(wal_reader); + xlogreader->seg.ws_file = NeonWALReaderGetSegment(wal_reader)->ws_file; +} + +void +NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) +{ + if (!wal_reader) + { + XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); + + if (epochStartLsn == 0) + { + elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!"); + } + wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] "); + } + xlr->page_read = NeonWALPageRead; + xlr->segment_open = NeonWALReadSegmentOpen; + xlr->segment_close = NeonWALReadSegmentClose; +} diff --git a/pgxn/neon/walsender_hooks.h b/pgxn/neon/walsender_hooks.h new file mode 100644 index 0000000000..2e3ce180f9 --- /dev/null +++ b/pgxn/neon/walsender_hooks.h @@ -0,0 +1,7 @@ +#ifndef __WALSENDER_HOOKS_H__ +#define __WALSENDER_HOOKS_H__ + +struct 
XLogReaderRoutine; +void NeonOnDemandXLogReaderRoutines(struct XLogReaderRoutine *xlr); + +#endif diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 9c774ec185..1ee87357e5 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.0.sql +DATA = neon_test_utils--1.1.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.0.sql b/pgxn/neon_test_utils/neon_test_utils--1.1.sql similarity index 59% rename from pgxn/neon_test_utils/neon_test_utils--1.0.sql rename to pgxn/neon_test_utils/neon_test_utils--1.1.sql index 402981a9a6..534784f319 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.1.sql @@ -7,18 +7,36 @@ AS 'MODULE_PATHNAME', 'test_consume_xids' LANGUAGE C STRICT PARALLEL UNSAFE; +CREATE FUNCTION test_consume_cpu(seconds int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_cpu' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_consume_memory(megabytes int) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_consume_memory' +LANGUAGE C STRICT +PARALLEL UNSAFE; + +CREATE FUNCTION test_release_memory(megabytes int DEFAULT NULL) +RETURNS VOID +AS 'MODULE_PATHNAME', 'test_release_memory' +LANGUAGE C +PARALLEL UNSAFE; + CREATE FUNCTION clear_buffer_cache() RETURNS VOID AS 'MODULE_PATHNAME', 'clear_buffer_cache' LANGUAGE C STRICT PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn) +CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn) +CREATE FUNCTION 
get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, request_lsn pg_lsn, not_modified_since pg_lsn) RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 94e6720503..5f6d640835 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,5 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.0' +default_version = '1.1' module_pathname = '$libdir/neon_test_utils' relocatable = true +trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index aa644efd40..47f245fbf1 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -21,10 +21,12 @@ #include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/fd.h" #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/rel.h" #include "utils/varlena.h" +#include "utils/wait_event.h" #include "../neon/pagestore_client.h" PG_MODULE_MAGIC; @@ -32,6 +34,9 @@ PG_MODULE_MAGIC; extern void _PG_init(void); PG_FUNCTION_INFO_V1(test_consume_xids); +PG_FUNCTION_INFO_V1(test_consume_cpu); +PG_FUNCTION_INFO_V1(test_consume_memory); +PG_FUNCTION_INFO_V1(test_release_memory); PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); @@ -43,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush); */ #if PG_MAJORVERSION_NUM < 16 typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool request_latest, char *buffer); + neon_request_lsns request_lsns, char *buffer); #else typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - XLogRecPtr request_lsn, bool 
request_latest, void *buffer); + neon_request_lsns request_lsns, void *buffer); #endif static neon_read_at_lsn_type neon_read_at_lsn_ptr; @@ -97,6 +102,117 @@ test_consume_xids(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } + +/* + * test_consume_cpu(seconds int). Keeps one CPU busy for the given number of seconds. + */ +Datum +test_consume_cpu(PG_FUNCTION_ARGS) +{ + int32 seconds = PG_GETARG_INT32(0); + TimestampTz start; + uint64 total_iterations = 0; + + start = GetCurrentTimestamp(); + + for (;;) + { + TimestampTz elapsed; + + elapsed = GetCurrentTimestamp() - start; + if (elapsed > (TimestampTz) seconds * USECS_PER_SEC) + break; + + /* keep spinning */ + for (int i = 0; i < 1000000; i++) + total_iterations++; + elog(DEBUG2, "test_consume_cpu(): %lu iterations in total", total_iterations); + + CHECK_FOR_INTERRUPTS(); + } + + PG_RETURN_VOID(); +} + +static MemoryContext consume_cxt = NULL; +static slist_head consumed_memory_chunks; +static int64 num_memory_chunks; + +/* + * test_consume_memory(megabytes int). + * + * Consume given amount of memory. The allocation is made in TopMemoryContext, + * so it outlives the function, until you call test_release_memory to + * explicitly release it, or close the session. + */ +Datum +test_consume_memory(PG_FUNCTION_ARGS) +{ + int32 megabytes = PG_GETARG_INT32(0); + + /* + * Consume the memory in a new memory context, so that it's convenient to + * release and to display it separately in a possible memory context dump. 
+ */ + if (consume_cxt == NULL) + consume_cxt = AllocSetContextCreate(TopMemoryContext, + "test_consume_memory", + ALLOCSET_DEFAULT_SIZES); + + for (int32 i = 0; i < megabytes; i++) + { + char *p; + + p = MemoryContextAllocZero(consume_cxt, 1024 * 1024); + + /* touch the memory, so that it's really allocated by the kernel */ + for (int j = 0; j < 1024 * 1024; j += 1024) + p[j] = j % 0xFF; + + slist_push_head(&consumed_memory_chunks, (slist_node *) p); + num_memory_chunks++; + } + + PG_RETURN_VOID(); +} + +/* + * test_release_memory(megabytes int). NULL releases all + */ +Datum +test_release_memory(PG_FUNCTION_ARGS) +{ + if (PG_ARGISNULL(0)) + { + if (consume_cxt) + { + MemoryContextDelete(consume_cxt); + consume_cxt = NULL; + num_memory_chunks = 0; + } + } + else + { + int32 chunks_to_release = PG_GETARG_INT32(0); + + if (chunks_to_release > num_memory_chunks) + { + elog(WARNING, "only %lu MB is consumed, releasing it all", num_memory_chunks); + chunks_to_release = num_memory_chunks; + } + + for (int32 i = 0; i < chunks_to_release; i++) + { + slist_node *chunk = slist_pop_head_node(&consumed_memory_chunks); + + pfree(chunk); + num_memory_chunks--; + } + } + + PG_RETURN_VOID(); +} + /* * Flush the buffer cache, evicting all pages that are not currently pinned. */ @@ -182,9 +298,10 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) text *relname; text *forkname; uint32 blkno; + neon_request_lsns request_lsns; - bool request_latest = PG_ARGISNULL(3); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3); + if (PG_NARGS() != 5) + elog(ERROR, "unexpected number of arguments in SQL function signature"); if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) PG_RETURN_NULL(); @@ -193,6 +310,16 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) forkname = PG_GETARG_TEXT_PP(1); blkno = PG_GETARG_UINT32(2); + request_lsns.request_lsn = PG_ARGISNULL(3) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(3); + request_lsns.not_modified_since = PG_ARGISNULL(4) ? 
request_lsns.request_lsn : PG_GETARG_LSN(4); + /* + * For the time being, use the same LSN for request and + * effective request LSN. If any test needed to use UINT64_MAX + * as the request LSN, we'd need to add effective_request_lsn + * as a new argument. + */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -245,7 +372,8 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS) SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(InfoFromRelation(rel), forknum, blkno, request_lsns, + raw_page_data); relation_close(rel, AccessShareLock); @@ -264,6 +392,9 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) { char *raw_page_data; + if (PG_NARGS() != 7) + elog(ERROR, "unexpected number of arguments in SQL function signature"); + if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), @@ -287,18 +418,26 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) }; ForkNumber forknum = PG_GETARG_UINT32(3); - uint32 blkno = PG_GETARG_UINT32(4); - bool request_latest = PG_ARGISNULL(5); - uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5); + neon_request_lsns request_lsns; /* Initialize buffer to copy to */ bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + request_lsns.request_lsn = PG_ARGISNULL(5) ? GetXLogInsertRecPtr() : PG_GETARG_LSN(5); + request_lsns.not_modified_since = PG_ARGISNULL(6) ? request_lsns.request_lsn : PG_GETARG_LSN(6); + /* + * For the time being, use the same LSN for request + * and effective request LSN. If any test needed to + * use UINT64_MAX as the request LSN, we'd need to add + * effective_request_lsn as a new argument. 
+ */ + request_lsns.effective_request_lsn = request_lsns.request_lsn; + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); raw_page_data = VARDATA(raw_page); - neon_read_at_lsn(rinfo, forknum, blkno, read_lsn, request_latest, raw_page_data); + neon_read_at_lsn(rinfo, forknum, blkno, request_lsns, raw_page_data); PG_RETURN_BYTEA_P(raw_page); } } diff --git a/pgxn/neon_walredo/walredoproc.c b/pgxn/neon_walredo/walredoproc.c index bdc50b0aa9..c4ab22636b 100644 --- a/pgxn/neon_walredo/walredoproc.c +++ b/pgxn/neon_walredo/walredoproc.c @@ -140,9 +140,45 @@ static XLogReaderState *reader_state; #define TRACE DEBUG5 #ifdef HAVE_LIBSECCOMP + + +/* + * https://man7.org/linux/man-pages/man2/close_range.2.html + * + * The `close_range` syscall is available as of Linux 5.9. + * + * The `close_range` libc wrapper is only available in glibc >= 2.34. + * Debian Bullseye ships a libc package based on glibc 2.31. + * => write the wrapper ourselves, using the syscall number from the kernel headers. + * + * If the Linux uAPI headers don't define the system call number, + * fail the build deliberately rather than ifdef'ing it to ENOSYS. + * We prefer a compile time over a runtime error for walredo. + */ +#include +#include +#include + +static int +close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flags) +{ + return syscall(__NR_close_range, start_fd, count, flags); +} + static void enter_seccomp_mode(void) { + + /* + * The pageserver process relies on us to close all the file descriptors + * it potentially leaked to us, _before_ we start processing potentially dangerous + * wal records. See the comment in the Rust code that launches this process. 
+ */ + int err; + if (err = close_range_syscall(3, ~0U, 0)) { + ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3"))); + } + PgSeccompRule syscalls[] = { /* Hard requirements */ @@ -184,6 +220,9 @@ enter_seccomp_mode(void) } #endif /* HAVE_LIBSECCOMP */ +PGDLLEXPORT void +WalRedoMain(int argc, char *argv[]); + /* * Entry point for the WAL redo process. * @@ -771,6 +810,9 @@ ApplyRecord(StringInfo input_message) ErrorContextCallback errcallback; #if PG_VERSION_NUM >= 150000 DecodedXLogRecord *decoded; +#define STATIC_DECODEBUF_SIZE (64 * 1024) + static char *static_decodebuf = NULL; + size_t required_space; #endif /* @@ -800,7 +842,19 @@ ApplyRecord(StringInfo input_message) XLogBeginRead(reader_state, lsn); #if PG_VERSION_NUM >= 150000 - decoded = (DecodedXLogRecord *) XLogReadRecordAlloc(reader_state, record->xl_tot_len, true); + /* + * For reasonably small records, reuse a fixed size buffer to reduce + * palloc overhead. + */ + required_space = DecodeXLogRecordRequiredSpace(record->xl_tot_len); + if (required_space <= STATIC_DECODEBUF_SIZE) + { + if (static_decodebuf == NULL) + static_decodebuf = MemoryContextAlloc(TopMemoryContext, STATIC_DECODEBUF_SIZE); + decoded = (DecodedXLogRecord *) static_decodebuf; + } + else + decoded = palloc(required_space); if (!DecodeXLogRecord(reader_state, decoded, record, lsn, &errormsg)) elog(ERROR, "failed to decode WAL record: %s", errormsg); @@ -809,37 +863,15 @@ ApplyRecord(StringInfo input_message) /* Record the location of the next record. */ decoded->next_lsn = reader_state->NextRecPtr; - /* - * If it's in the decode buffer, mark the decode buffer space as - * occupied. - */ - if (!decoded->oversized) - { - /* The new decode buffer head must be MAXALIGNed. 
*/ - Assert(decoded->size == MAXALIGN(decoded->size)); - if ((char *) decoded == reader_state->decode_buffer) - reader_state->decode_buffer_tail = reader_state->decode_buffer + decoded->size; - else - reader_state->decode_buffer_tail += decoded->size; - } - - /* Insert it into the queue of decoded records. */ - Assert(reader_state->decode_queue_tail != decoded); - if (reader_state->decode_queue_tail) - reader_state->decode_queue_tail->next = decoded; - reader_state->decode_queue_tail = decoded; - if (!reader_state->decode_queue_head) - reader_state->decode_queue_head = decoded; - /* * Update the pointers to the beginning and one-past-the-end of this * record, again for the benefit of historical code that expected the * decoder to track this rather than accessing these fields of the record * itself. */ - reader_state->record = reader_state->decode_queue_head; - reader_state->ReadRecPtr = reader_state->record->lsn; - reader_state->EndRecPtr = reader_state->record->next_lsn; + reader_state->record = decoded; + reader_state->ReadRecPtr = decoded->lsn; + reader_state->EndRecPtr = decoded->next_lsn; } #else /* @@ -879,8 +911,9 @@ ApplyRecord(StringInfo input_message) elog(TRACE, "applied WAL record with LSN %X/%X", (uint32) (lsn >> 32), (uint32) lsn); + #if PG_VERSION_NUM >= 150000 - if (decoded && decoded->oversized) + if ((char *) decoded != static_decodebuf) pfree(decoded); #endif } diff --git a/poetry.lock b/poetry.lock index 428698cb5a..7740388fb8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,88 +1,88 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. 
[[package]] name = "aiohttp" -version = "3.9.0" +version = "3.9.4" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.8" files = [ - {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"}, - {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"}, - {file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"}, - {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"}, - {file = 
"aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"}, - {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"}, - {file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"}, - {file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"}, - {file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"}, - {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"}, - {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"}, - {file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"}, - {file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"}, - {file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"}, - {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"}, - {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"}, - {file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"}, - {file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"}, - {file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"}, - {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"}, - {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"}, - {file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = "sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"}, - {file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"}, - {file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"}, - {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"}, - {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"}, - {file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"}, - {file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"}, - {file = "aiohttp-3.9.0.tar.gz", hash = 
"sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"}, + {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"}, + {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"}, + {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"}, + {file = 
"aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"}, + {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"}, + {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"}, + {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"}, + {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"}, + {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"}, + {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"}, + {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"}, + {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"}, + {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"}, + {file = 
"aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"}, + {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"}, + {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"}, + {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"}, + {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"}, + {file = 
"aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"}, + {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"}, + {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"}, + {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"}, + {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"}, + {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"}, + {file = 
"aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"}, + {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"}, + {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"}, + {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"}, + {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"}, + {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"}, ] [package.dependencies] @@ -158,6 
+158,50 @@ files = [ attrs = ">=16.0.0" pluggy = ">=0.4.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] + +[[package]] +name = "antlr4-python3-runtime" +version = "4.13.1" +description = "ANTLR 4.13.1 runtime for Python 3" +optional = false +python-versions = "*" +files = [ + {file = "antlr4-python3-runtime-4.13.1.tar.gz", hash = "sha256:3cd282f5ea7cfb841537fe01f143350fdb1c0b1ce7981443a2fa8513fddb6d1a"}, + {file = "antlr4_python3_runtime-4.13.1-py3-none-any.whl", hash = "sha256:78ec57aad12c97ac039ca27403ad61cb98aaec8a3f9bb8144f889aa0fa28b943"}, +] + +[[package]] +name = "anyio" +version = "4.3.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.8" +files = [ + {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"}, + {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.23)"] + [[package]] name = "async-timeout" 
version = "4.0.3" @@ -245,22 +289,23 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy" [[package]] name = "aws-sam-translator" -version = "1.48.0" +version = "1.88.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" optional = false -python-versions = ">=3.7, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "aws-sam-translator-1.48.0.tar.gz", hash = "sha256:7171037323dfa30f8f73e9bccb9210e4c384a585e087219a9518a5204f0a2c44"}, - {file = "aws_sam_translator-1.48.0-py2-none-any.whl", hash = "sha256:be18dfa3dfe7ab291d281667c5f73ac62dbe6bfe86df7d122e4258b906b736f0"}, - {file = "aws_sam_translator-1.48.0-py3-none-any.whl", hash = "sha256:ca4f8f9910d7713aeaba59346775bfb3198f6acb47c6704572f9bd3fc0fb5bf0"}, + {file = "aws_sam_translator-1.88.0-py3-none-any.whl", hash = "sha256:aa93d498d8de3fb3d485c316155b1628144b823bbc176099a20de06df666fcac"}, + {file = "aws_sam_translator-1.88.0.tar.gz", hash = "sha256:e77c65f3488566122277accd44a0f1ec018e37403e0d5fe25120d96e537e91a7"}, ] [package.dependencies] boto3 = ">=1.19.5,<2.dev0" -jsonschema = ">=3.2,<4.0" +jsonschema = ">=3.2,<5" +pydantic = ">=1.8,<3" +typing-extensions = ">=4.4" [package.extras] -dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)"] +dev = ["black (==24.3.0)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.dev0)", "coverage (>=5.3,<8)", "dateparser (>=1.1,<2.0)", "mypy (>=1.3.0,<1.4.0)", "parameterized (>=0.7,<1.0)", "pytest (>=6.2,<8)", "pytest-cov (>=2.10,<5)", 
"pytest-env (>=0.6,<1)", "pytest-rerunfailures (>=9.1,<12)", "pytest-xdist (>=2.5,<4)", "pyyaml (>=6.0,<7.0)", "requests (>=2.28,<3.0)", "ruamel.yaml (==0.17.21)", "ruff (>=0.1.0,<0.2.0)", "tenacity (>=8.0,<9.0)", "types-PyYAML (>=6.0,<7.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -776,24 +821,26 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.61.3" +version = "0.87.1" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" optional = false -python-versions = ">=3.6, <=4.0, !=4.0" +python-versions = "!=4.0,<=4.0,>=3.8" files = [ - {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, - {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, + {file = "cfn_lint-0.87.1-py3-none-any.whl", hash = "sha256:d450f450635fc223b6f66880ccac52a5fd1a52966fa1705f1ba52b88dfed3071"}, + {file = "cfn_lint-0.87.1.tar.gz", hash = "sha256:b3ce9d3e5e0eadcea5d584c8ccaa00bf2a990a36a64d7ffd8683bc60b7e4f06f"}, ] [package.dependencies] -aws-sam-translator = ">=1.47.0" +aws-sam-translator = ">=1.87.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" -jsonschema = ">=3.0,<4.0" +jsonschema = ">=3.0,<5" junit-xml = ">=1.9,<2.0" -networkx = ">=2.4,<3.0" +networkx = ">=2.4,<4" pyyaml = ">5.4" +regex = ">=2021.7.1" sarif-om = ">=1.0.4,<1.1.0" +sympy = ">=1.0.0" [[package]] name = "charset-normalizer" @@ -836,47 +883,56 @@ files = [ [[package]] name = "cryptography" -version = "41.0.6" +version = "42.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c"}, - {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae"}, - {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d"}, - {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c"}, - {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596"}, - {file = "cryptography-41.0.6-cp37-abi3-win32.whl", hash = "sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660"}, - {file = "cryptography-41.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = 
"sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da"}, - {file = "cryptography-41.0.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09"}, - {file = "cryptography-41.0.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43"}, - {file = "cryptography-41.0.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4"}, - {file = "cryptography-41.0.6.tar.gz", hash = "sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3"}, + {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"}, + {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"}, + {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"}, + {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"}, + {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"}, + {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"}, + {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"}, + {file = 
"cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"}, + {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"}, + {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"}, + {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"}, + {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"}, + {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"}, + {file = 
"cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"}, + {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"}, + {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"}, ] [package.dependencies] -cffi = ">=1.12" +cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} [package.extras] docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] +docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] nox = ["nox"] -pep8test = ["black", "check-sdist", "mypy", "ruff"] +pep8test = ["check-sdist", "click", "mypy", "ruff"] sdist = ["build"] ssh = ["bcrypt (>=3.1.5)"] -test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] [[package]] @@ -900,24 +956,6 @@ websocket-client = ">=0.32.0" ssh = ["paramiko (>=2.4.2)"] tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] -[[package]] -name = "ecdsa" -version = "0.18.0" -description = "ECDSA cryptographic signature library (pure python)" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "ecdsa-0.18.0-py2.py3-none-any.whl", hash = "sha256:80600258e7ed2f16b9aa1d7c295bd70194109ad5a30fdee0eaeefef1d4c559dd"}, - {file = "ecdsa-0.18.0.tar.gz", hash = "sha256:190348041559e21b22a1d65cee485282ca11a6f81d503fddb84d5017e9ed1e49"}, -] - 
-[package.dependencies] -six = ">=1.9.0" - -[package.extras] -gmpy = ["gmpy"] -gmpy2 = ["gmpy2"] - [[package]] name = "exceptiongroup" version = "1.1.1" @@ -970,18 +1008,17 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "3.0.10" +version = "4.0.1" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" files = [ - {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, - {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, + {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"}, + {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"}, ] [package.dependencies] Flask = ">=0.9" -Six = "*" [[package]] name = "frozenlist" @@ -1064,15 +1101,109 @@ files = [ {file = "graphql_core-3.2.1-py3-none-any.whl", hash = "sha256:f83c658e4968998eed1923a2e3e3eddd347e005ac0315fbb7ca4d70ea9156323"}, ] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "h2" +version = "4.1.0" +description = "HTTP/2 State-Machine based protocol implementation" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, + {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, +] + 
+[package.dependencies] +hpack = ">=4.0,<5" +hyperframe = ">=6.0,<7" + +[[package]] +name = "hpack" +version = "4.0.0" +description = "Pure-Python HPACK header compression" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, + {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, +] + +[[package]] +name = "httpcore" +version = "1.0.3" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.3-py3-none-any.whl", hash = "sha256:9a6a501c3099307d9fd76ac244e08503427679b1e81ceb1d922485e2f2462ad2"}, + {file = "httpcore-1.0.3.tar.gz", hash = "sha256:5c0f9546ad17dac4d0772b0808856eb616eb8b48ce94f49ed819fd6982a8a544"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<0.24.0)"] + +[[package]] +name = "httpx" +version = "0.26.0" +description = "The next generation HTTP client." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.26.0-py3-none-any.whl", hash = "sha256:8915f5a3627c4d47b73e8202457cb28f1266982d1159bd5779d86a80c0eab1cd"}, + {file = "httpx-0.26.0.tar.gz", hash = "sha256:451b55c30d5185ea6b23c2c793abf9bb237d2a7dfb901ced6ff69ad37ec1dfaf"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""} +httpcore = "==1.*" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "hyperframe" +version = "6.0.1" +description = "HTTP/2 framing layer for Python" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, + {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, +] + [[package]] name = "idna" -version = "3.3" +version = "3.7" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" files = [ - {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"}, - {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"}, + {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, + {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] [[package]] @@ -1118,13 +1249,13 @@ files = [ [[package]] name = "jinja2" -version = "3.1.2" +version = "3.1.4" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" files = [ - {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, - {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, ] [package.dependencies] @@ -1144,6 +1275,23 @@ files = [ {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] +[[package]] +name = "joserfc" +version = "0.9.0" +description = "The ultimate Python library for JOSE RFCs, including JWS, JWE, JWK, JWA, JWT" +optional = false +python-versions = ">=3.8" +files = [ + {file = "joserfc-0.9.0-py3-none-any.whl", hash = "sha256:4026bdbe2c196cd40574e916fa1e28874d99649412edaab0e373dec3077153fb"}, + {file = "joserfc-0.9.0.tar.gz", hash = "sha256:eebca7f587b1761ce43a98ffd5327f2b600b9aa5bb0a77b947687f503ad43bc0"}, +] + +[package.dependencies] +cryptography = "*" + +[package.extras] +drafts = ["pycryptodome"] + [[package]] name = "jschema-to-python" version = "1.2.3" @@ -1185,6 +1333,20 @@ files = [ [package.dependencies] jsonpointer = ">=1.9" +[[package]] +name = "jsonpath-ng" +version = "1.6.1" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
+optional = false +python-versions = "*" +files = [ + {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, + {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, +] + +[package.dependencies] +ply = "*" + [[package]] name = "jsonpickle" version = "2.2.0" @@ -1214,24 +1376,39 @@ files = [ [[package]] name = "jsonschema" -version = "3.2.0" +version = "4.17.3" description = "An implementation of JSON Schema validation for Python" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "jsonschema-3.2.0-py2.py3-none-any.whl", hash = "sha256:4e5b3cf8216f577bee9ce139cbe72eca3ea4f292ec60928ff24758ce626cd163"}, - {file = "jsonschema-3.2.0.tar.gz", hash = "sha256:c8a85b28d377cc7737e46e2d9f2b4f44ee3c0e1deac6bf46ddefc7187d30797a"}, + {file = "jsonschema-4.17.3-py3-none-any.whl", hash = "sha256:a870ad254da1a8ca84b6a2905cac29d265f805acc57af304784962a2aa6508f6"}, + {file = "jsonschema-4.17.3.tar.gz", hash = "sha256:0f864437ab8b6076ba6707453ef8f98a6a0d512a80e93f8abdb676f737ecb60d"}, ] [package.dependencies] attrs = ">=17.4.0" -pyrsistent = ">=0.14.0" -setuptools = "*" -six = ">=1.11.0" +pyrsistent = ">=0.14.0,<0.17.0 || >0.17.0,<0.17.1 || >0.17.1,<0.17.2 || >0.17.2" [package.extras] -format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + +[[package]] +name = "jsonschema-spec" +version = "0.1.6" +description = "JSONSchema Spec with object-oriented paths" +optional = false 
+python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "jsonschema_spec-0.1.6-py3-none-any.whl", hash = "sha256:f2206d18c89d1824c1f775ba14ed039743b41a9167bd2c5bdb774b66b3ca0bbf"}, + {file = "jsonschema_spec-0.1.6.tar.gz", hash = "sha256:90215863b56e212086641956b20127ccbf6d8a3a38343dad01d6a74d19482f76"}, +] + +[package.dependencies] +jsonschema = ">=4.0.0,<4.18.0" +pathable = ">=0.4.1,<0.5.0" +PyYAML = ">=5.1" +requests = ">=2.31.0,<3.0.0" [[package]] name = "junit-xml" @@ -1247,6 +1424,52 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "lazy-object-proxy" +version = "1.10.0" +description = "A fast and thorough lazy object proxy." +optional = false +python-versions = ">=3.8" +files = [ + {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"}, + {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"}, + {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"}, + {file = 
"lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"}, + {file = "lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"}, + {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"}, + {file = 
"lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"}, + {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"}, + {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"}, +] + [[package]] name = "markupsafe" version = "2.1.1" @@ -1298,64 +1521,80 @@ files = [ [[package]] name = "moto" -version = "4.1.2" +version = "5.0.6" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, - {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, + {file = "moto-5.0.6-py2.py3-none-any.whl", hash = "sha256:ca1e22831a741733b581ff2ef4d6ae2e1c6db1eab97af1b78b86ca2c6e88c609"}, + {file = "moto-5.0.6.tar.gz", hash = "sha256:ad8b23f2b555ad694da8b2432a42b6d96beaaf67a4e7d932196a72193a2eee2c"}, ] [package.dependencies] +antlr4-python3-runtime = {version = "*", optional = true, markers = "extra == \"server\""} aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} boto3 = ">=1.9.201" -botocore = ">=1.12.201" +botocore = ">=1.14.0" cfn-lint = 
{version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" -docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} -ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} +docker = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""} flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" +joserfc = {version = ">=0.9.0", optional = true, markers = "extra == \"server\""} jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} +jsonpath-ng = {version = "*", optional = true, markers = "extra == \"server\""} +openapi-spec-validator = {version = ">=0.5.0", optional = true, markers = "extra == \"server\""} +py-partiql-parser = {version = "0.5.4", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" -python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""} PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.13.0" +responses = ">=0.15.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} -sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", 
"pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -apigatewayv2 = ["PyYAML (>=5.1)"] +all = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +apigateway = ["PyYAML (>=5.1)", "joserfc (>=0.9.0)", "openapi-spec-validator (>=0.5.0)"] +apigatewayv2 = ["PyYAML (>=5.1)", "openapi-spec-validator (>=0.5.0)"] appsync = ["graphql-core"] -awslambda = ["docker (>=2.5.1)"] -batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] -ds = ["sshpubkeys (>=3.1.0)"] -dynamodb = ["docker (>=2.5.1)"] -dynamodbstreams = ["docker (>=2.5.1)"] -ebs = ["sshpubkeys (>=3.1.0)"] -ec2 = ["sshpubkeys (>=3.1.0)"] -efs = ["sshpubkeys (>=3.1.0)"] -eks = ["sshpubkeys (>=3.1.0)"] +awslambda = ["docker (>=3.0.0)"] +batch = ["docker (>=3.0.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +cognitoidp = ["joserfc (>=0.9.0)"] +dynamodb = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] +dynamodbstreams = ["docker (>=3.0.0)", "py-partiql-parser (==0.5.4)"] glue = ["pyparsing 
(>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] -route53resolver = ["sshpubkeys (>=3.1.0)"] -s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +proxy = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "multipart", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] +resourcegroupstaggingapi = ["PyYAML (>=5.1)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)"] +s3 = ["PyYAML (>=5.1)", "py-partiql-parser (==0.5.4)"] +s3crc32c = ["PyYAML (>=5.1)", "crc32c", "py-partiql-parser (==0.5.4)"] +server = ["PyYAML (>=5.1)", "antlr4-python3-runtime", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=3.0.0)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "joserfc (>=0.9.0)", "jsondiff (>=1.1.2)", "jsonpath-ng", "openapi-spec-validator (>=0.5.0)", "py-partiql-parser (==0.5.4)", "pyparsing (>=3.0.7)", "setuptools"] ssm = ["PyYAML (>=5.1)"] +stepfunctions = ["antlr4-python3-runtime", "jsonpath-ng"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = false +python-versions = "*" +files = [ + {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, + {file = "mpmath-1.3.0.tar.gz", hash = 
"sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, +] + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + [[package]] name = "multidict" version = "6.0.4" @@ -1530,42 +1769,38 @@ test = ["codecov (>=2.1)", "pytest (>=7.1)", "pytest-cov (>=3.0)"] [[package]] name = "openapi-schema-validator" -version = "0.2.3" +version = "0.4.4" description = "OpenAPI schema validation for Python" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-schema-validator-0.2.3.tar.gz", hash = "sha256:2c64907728c3ef78e23711c8840a423f0b241588c9ed929855e4b2d1bb0cf5f2"}, - {file = "openapi_schema_validator-0.2.3-py3-none-any.whl", hash = "sha256:9bae709212a19222892cabcc60cafd903cbf4b220223f48583afa3c0e3cc6fc4"}, + {file = "openapi_schema_validator-0.4.4-py3-none-any.whl", hash = "sha256:79f37f38ef9fd5206b924ed7a6f382cea7b649b3b56383c47f1906082b7b9015"}, + {file = "openapi_schema_validator-0.4.4.tar.gz", hash = "sha256:c573e2be2c783abae56c5a1486ab716ca96e09d1c3eab56020d1dc680aa57bf8"}, ] [package.dependencies] -jsonschema = ">=3.0.0,<5.0.0" +jsonschema = ">=4.0.0,<4.18.0" +rfc3339-validator = "*" [package.extras] -isodate = ["isodate"] -rfc3339-validator = ["rfc3339-validator"] -strict-rfc3339 = ["strict-rfc3339"] +docs = ["sphinx (>=5.3.0,<6.0.0)", "sphinx-immaterial (>=0.11.0,<0.12.0)"] [[package]] name = "openapi-spec-validator" -version = "0.4.0" -description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3.0 spec validator" +version = "0.5.7" +description = "OpenAPI 2.0 (aka Swagger) and OpenAPI 3 spec validator" optional = false python-versions = ">=3.7.0,<4.0.0" files = [ - {file = "openapi-spec-validator-0.4.0.tar.gz", hash = "sha256:97f258850afc97b048f7c2653855e0f88fa66ac103c2be5077c7960aca2ad49a"}, - {file = "openapi_spec_validator-0.4.0-py3-none-any.whl", hash = 
"sha256:06900ac4d546a1df3642a779da0055be58869c598e3042a2fef067cfd99d04d0"}, + {file = "openapi_spec_validator-0.5.7-py3-none-any.whl", hash = "sha256:8712d2879db7692974ef89c47a3ebfc79436442921ec3a826ac0ce80cde8c549"}, + {file = "openapi_spec_validator-0.5.7.tar.gz", hash = "sha256:6c2d42180045a80fd6314de848b94310bdb0fa4949f4b099578b69f79d9fa5ac"}, ] [package.dependencies] -jsonschema = ">=3.2.0,<5.0.0" -openapi-schema-validator = ">=0.2.0,<0.3.0" -PyYAML = ">=5.1" -setuptools = "*" - -[package.extras] -requests = ["requests"] +jsonschema = ">=4.0.0,<4.18.0" +jsonschema-spec = ">=0.1.1,<0.2.0" +lazy-object-proxy = ">=1.7.1,<2.0.0" +openapi-schema-validator = ">=0.4.2,<0.5.0" [[package]] name = "packaging" @@ -1578,6 +1813,17 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] +[[package]] +name = "pathable" +version = "0.4.3" +description = "Object-oriented paths" +optional = false +python-versions = ">=3.7.0,<4.0.0" +files = [ + {file = "pathable-0.4.3-py3-none-any.whl", hash = "sha256:cdd7b1f9d7d5c8b8d3315dbf5a86b2596053ae845f056f57d97c0eefff84da14"}, + {file = "pathable-0.4.3.tar.gz", hash = "sha256:5c869d315be50776cc8a993f3af43e0c60dc01506b399643f919034ebf4cdcab"}, +] + [[package]] name = "pbr" version = "5.9.0" @@ -1604,6 +1850,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "prometheus-client" version = "0.14.1" @@ -1716,16 +1973,19 @@ files = [ ] [[package]] -name = "pyasn1" -version = "0.4.8" -description = "ASN.1 types and codecs" +name = 
"py-partiql-parser" +version = "0.5.4" +description = "Pure Python PartiQL Parser" optional = false python-versions = "*" files = [ - {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, - {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, + {file = "py_partiql_parser-0.5.4-py2.py3-none-any.whl", hash = "sha256:3dc4295a47da9587681a96b35c6e151886fdbd0a4acbe0d97c4c68e5f689d315"}, + {file = "py_partiql_parser-0.5.4.tar.gz", hash = "sha256:72e043919538fa63edae72fb59afc7e3fd93adbde656718a7d2b4666f23dd114"}, ] +[package.extras] +dev = ["black (==22.6.0)", "flake8", "mypy", "pytest"] + [[package]] name = "pycparser" version = "2.21" @@ -1737,6 +1997,116 @@ files = [ {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +[[package]] +name = "pydantic" +version = "2.7.1" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, + {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, +] + +[package.dependencies] +annotated-types = ">=0.4.0" +pydantic-core = "2.18.2" +typing-extensions = ">=4.6.1" + +[package.extras] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.18.2" +description = "Core functionality for Pydantic validation and serialization" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, + {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, + {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, + {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, + {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, + {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, + {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, + {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, + {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, + {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, + {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, + {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = 
"sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, + {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, + {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, + {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, + {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, + {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, + {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = 
"sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, + {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, + {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, + {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, + {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, + {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, + 
{file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, + {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, + {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, + {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, + {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, + {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", 
hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, + {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, + {file = 
"pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, + {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, + {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, +] + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + [[package]] name = "pyjwt" version = "2.4.0" @@ -1900,6 +2270,20 @@ pytest = [ {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] +[[package]] +name = "pytest-repeat" +version = "0.9.3" +description = "pytest plugin for repeating tests" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest_repeat-0.9.3-py3-none-any.whl", hash = "sha256:26ab2df18226af9d5ce441c858f273121e92ff55f5bb311d25755b8d7abdd8ed"}, + {file = "pytest_repeat-0.9.3.tar.gz", hash = "sha256:ffd3836dfcd67bb270bec648b330e20be37d2966448c4148c4092d1e8aba8185"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "pytest-rerunfailures" version = "13.0" @@ -1977,28 +2361,6 @@ files = [ [package.dependencies] six = ">=1.5" -[[package]] -name = "python-jose" -version = "3.3.0" -description = "JOSE implementation in Python" -optional = false -python-versions = "*" -files = [ - {file = "python-jose-3.3.0.tar.gz", hash = "sha256:55779b5e6ad599c6336191246e95eb2293a9ddebd555f796a65f838f07e5d78a"}, - {file = "python_jose-3.3.0-py2.py3-none-any.whl", hash = "sha256:9b1376b023f8b298536eedd47ae1089bcdb848f1535ab30555cd92002d78923a"}, -] - 
-[package.dependencies] -cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"cryptography\""} -ecdsa = "!=0.15" -pyasn1 = "*" -rsa = "*" - -[package.extras] -cryptography = ["cryptography (>=3.4.0)"] -pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] -pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] - [[package]] name = "pywin32" version = "301" @@ -2043,6 +2405,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2077,15 +2440,103 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "regex" +version = "2024.4.28" +description = "Alternative regular expression module, to replace re." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd196d056b40af073d95a2879678585f0b74ad35190fac04ca67954c582c6b61"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8bb381f777351bd534462f63e1c6afb10a7caa9fa2a421ae22c26e796fe31b1f"}, + {file = "regex-2024.4.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:47af45b6153522733aa6e92543938e97a70ce0900649ba626cf5aad290b737b6"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99d6a550425cc51c656331af0e2b1651e90eaaa23fb4acde577cf15068e2e20f"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bf29304a8011feb58913c382902fde3395957a47645bf848eea695839aa101b7"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:92da587eee39a52c91aebea8b850e4e4f095fe5928d415cb7ed656b3460ae79a"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6277d426e2f31bdbacb377d17a7475e32b2d7d1f02faaecc48d8e370c6a3ff31"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:28e1f28d07220c0f3da0e8fcd5a115bbb53f8b55cecf9bec0c946eb9a059a94c"}, + {file = "regex-2024.4.28-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:aaa179975a64790c1f2701ac562b5eeb733946eeb036b5bcca05c8d928a62f10"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6f435946b7bf7a1b438b4e6b149b947c837cb23c704e780c19ba3e6855dbbdd3"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:19d6c11bf35a6ad077eb23852827f91c804eeb71ecb85db4ee1386825b9dc4db"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = 
"sha256:fdae0120cddc839eb8e3c15faa8ad541cc6d906d3eb24d82fb041cfe2807bc1e"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e672cf9caaf669053121f1766d659a8813bd547edef6e009205378faf45c67b8"}, + {file = "regex-2024.4.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f57515750d07e14743db55d59759893fdb21d2668f39e549a7d6cad5d70f9fea"}, + {file = "regex-2024.4.28-cp310-cp310-win32.whl", hash = "sha256:a1409c4eccb6981c7baabc8888d3550df518add6e06fe74fa1d9312c1838652d"}, + {file = "regex-2024.4.28-cp310-cp310-win_amd64.whl", hash = "sha256:1f687a28640f763f23f8a9801fe9e1b37338bb1ca5d564ddd41619458f1f22d1"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:84077821c85f222362b72fdc44f7a3a13587a013a45cf14534df1cbbdc9a6796"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b45d4503de8f4f3dc02f1d28a9b039e5504a02cc18906cfe744c11def942e9eb"}, + {file = "regex-2024.4.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:457c2cd5a646dd4ed536c92b535d73548fb8e216ebee602aa9f48e068fc393f3"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b51739ddfd013c6f657b55a508de8b9ea78b56d22b236052c3a85a675102dc6"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:459226445c7d7454981c4c0ce0ad1a72e1e751c3e417f305722bbcee6697e06a"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:670fa596984b08a4a769491cbdf22350431970d0112e03d7e4eeaecaafcd0fec"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe00f4fe11c8a521b173e6324d862ee7ee3412bf7107570c9b564fe1119b56fb"}, + {file = "regex-2024.4.28-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:36f392dc7763fe7924575475736bddf9ab9f7a66b920932d0ea50c2ded2f5636"}, + {file = 
"regex-2024.4.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:23a412b7b1a7063f81a742463f38821097b6a37ce1e5b89dd8e871d14dbfd86b"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f1d6e4b7b2ae3a6a9df53efbf199e4bfcff0959dbdb5fd9ced34d4407348e39a"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:499334ad139557de97cbc4347ee921c0e2b5e9c0f009859e74f3f77918339257"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:0940038bec2fe9e26b203d636c44d31dd8766abc1fe66262da6484bd82461ccf"}, + {file = "regex-2024.4.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:66372c2a01782c5fe8e04bff4a2a0121a9897e19223d9eab30c54c50b2ebeb7f"}, + {file = "regex-2024.4.28-cp311-cp311-win32.whl", hash = "sha256:c77d10ec3c1cf328b2f501ca32583625987ea0f23a0c2a49b37a39ee5c4c4630"}, + {file = "regex-2024.4.28-cp311-cp311-win_amd64.whl", hash = "sha256:fc0916c4295c64d6890a46e02d4482bb5ccf33bf1a824c0eaa9e83b148291f90"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:08a1749f04fee2811c7617fdd46d2e46d09106fa8f475c884b65c01326eb15c5"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b8eb28995771c087a73338f695a08c9abfdf723d185e57b97f6175c5051ff1ae"}, + {file = "regex-2024.4.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dd7ef715ccb8040954d44cfeff17e6b8e9f79c8019daae2fd30a8806ef5435c0"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb0315a2b26fde4005a7c401707c5352df274460f2f85b209cf6024271373013"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f2fc053228a6bd3a17a9b0a3f15c3ab3cf95727b00557e92e1cfe094b88cc662"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7fe9739a686dc44733d52d6e4f7b9c77b285e49edf8570754b322bca6b85b4cc"}, + {file = 
"regex-2024.4.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74fcf77d979364f9b69fcf8200849ca29a374973dc193a7317698aa37d8b01c"}, + {file = "regex-2024.4.28-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:965fd0cf4694d76f6564896b422724ec7b959ef927a7cb187fc6b3f4e4f59833"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2fef0b38c34ae675fcbb1b5db760d40c3fc3612cfa186e9e50df5782cac02bcd"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bc365ce25f6c7c5ed70e4bc674f9137f52b7dd6a125037f9132a7be52b8a252f"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:ac69b394764bb857429b031d29d9604842bc4cbfd964d764b1af1868eeebc4f0"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:144a1fc54765f5c5c36d6d4b073299832aa1ec6a746a6452c3ee7b46b3d3b11d"}, + {file = "regex-2024.4.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2630ca4e152c221072fd4a56d4622b5ada876f668ecd24d5ab62544ae6793ed6"}, + {file = "regex-2024.4.28-cp312-cp312-win32.whl", hash = "sha256:7f3502f03b4da52bbe8ba962621daa846f38489cae5c4a7b5d738f15f6443d17"}, + {file = "regex-2024.4.28-cp312-cp312-win_amd64.whl", hash = "sha256:0dd3f69098511e71880fb00f5815db9ed0ef62c05775395968299cb400aeab82"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:374f690e1dd0dbdcddea4a5c9bdd97632cf656c69113f7cd6a361f2a67221cb6"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f87ae6b96374db20f180eab083aafe419b194e96e4f282c40191e71980c666"}, + {file = "regex-2024.4.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5dbc1bcc7413eebe5f18196e22804a3be1bfdfc7e2afd415e12c068624d48247"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f85151ec5a232335f1be022b09fbbe459042ea1951d8a48fef251223fc67eee1"}, + {file = 
"regex-2024.4.28-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57ba112e5530530fd175ed550373eb263db4ca98b5f00694d73b18b9a02e7185"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:224803b74aab56aa7be313f92a8d9911dcade37e5f167db62a738d0c85fdac4b"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a54a047b607fd2d2d52a05e6ad294602f1e0dec2291152b745870afc47c1397"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a2a512d623f1f2d01d881513af9fc6a7c46e5cfffb7dc50c38ce959f9246c94"}, + {file = "regex-2024.4.28-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c06bf3f38f0707592898428636cbb75d0a846651b053a1cf748763e3063a6925"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1031a5e7b048ee371ab3653aad3030ecfad6ee9ecdc85f0242c57751a05b0ac4"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d7a353ebfa7154c871a35caca7bfd8f9e18666829a1dc187115b80e35a29393e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7e76b9cfbf5ced1aca15a0e5b6f229344d9b3123439ffce552b11faab0114a02"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ce479ecc068bc2a74cb98dd8dba99e070d1b2f4a8371a7dfe631f85db70fe6e"}, + {file = "regex-2024.4.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7d77b6f63f806578c604dca209280e4c54f0fa9a8128bb8d2cc5fb6f99da4150"}, + {file = "regex-2024.4.28-cp38-cp38-win32.whl", hash = "sha256:d84308f097d7a513359757c69707ad339da799e53b7393819ec2ea36bc4beb58"}, + {file = "regex-2024.4.28-cp38-cp38-win_amd64.whl", hash = "sha256:2cc1b87bba1dd1a898e664a31012725e48af826bf3971e786c53e32e02adae6c"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:7413167c507a768eafb5424413c5b2f515c606be5bb4ef8c5dee43925aa5718b"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:108e2dcf0b53a7c4ab8986842a8edcb8ab2e59919a74ff51c296772e8e74d0ae"}, + {file = "regex-2024.4.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f1c5742c31ba7d72f2dedf7968998730664b45e38827637e0f04a2ac7de2f5f1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecc6148228c9ae25ce403eade13a0961de1cb016bdb35c6eafd8e7b87ad028b1"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7d893c8cf0e2429b823ef1a1d360a25950ed11f0e2a9df2b5198821832e1947"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4290035b169578ffbbfa50d904d26bec16a94526071ebec3dadbebf67a26b25e"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a22ae1cfd82e4ffa2066eb3390777dc79468f866f0625261a93e44cdf6482b"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd24fd140b69f0b0bcc9165c397e9b2e89ecbeda83303abf2a072609f60239e2"}, + {file = "regex-2024.4.28-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:39fb166d2196413bead229cd64a2ffd6ec78ebab83fff7d2701103cf9f4dfd26"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9301cc6db4d83d2c0719f7fcda37229691745168bf6ae849bea2e85fc769175d"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:7c3d389e8d76a49923683123730c33e9553063d9041658f23897f0b396b2386f"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:99ef6289b62042500d581170d06e17f5353b111a15aa6b25b05b91c6886df8fc"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_s390x.whl", hash = 
"sha256:b91d529b47798c016d4b4c1d06cc826ac40d196da54f0de3c519f5a297c5076a"}, + {file = "regex-2024.4.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:43548ad74ea50456e1c68d3c67fff3de64c6edb85bcd511d1136f9b5376fc9d1"}, + {file = "regex-2024.4.28-cp39-cp39-win32.whl", hash = "sha256:05d9b6578a22db7dedb4df81451f360395828b04f4513980b6bd7a1412c679cc"}, + {file = "regex-2024.4.28-cp39-cp39-win_amd64.whl", hash = "sha256:3986217ec830c2109875be740531feb8ddafe0dfa49767cdcd072ed7e8927962"}, + {file = "regex-2024.4.28.tar.gz", hash = "sha256:83ab366777ea45d58f72593adf35d36ca911ea8bd838483c1823b883a121b0e4"}, +] + [[package]] name = "requests" -version = "2.31.0" +version = "2.32.0" description = "Python HTTP for Humans." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, + {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"}, + {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"}, ] [package.dependencies] @@ -2117,43 +2568,43 @@ urllib3 = ">=1.25.10" tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] [[package]] -name = "rsa" -version = "4.9" -description = "Pure-Python RSA implementation" +name = "rfc3339-validator" +version = "0.1.4" +description = "A pure python RFC3339 validator" optional = false -python-versions = ">=3.6,<4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ - {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, - {file = "rsa-4.9.tar.gz", hash = 
"sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, + {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, + {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, ] [package.dependencies] -pyasn1 = ">=0.1.3" +six = "*" [[package]] name = "ruff" -version = "0.1.11" +version = "0.2.2" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"}, - {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"}, - {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"}, - {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"}, - {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"}, - {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"}, - {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"}, - {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"}, + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, + {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, + {file = 
"ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, + {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, + {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, + {file = "ruff-0.2.2-py3-none-win32.whl", hash = "sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, + {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, + {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, + {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, ] [[package]] @@ -2216,22 +2667,29 @@ files = [ ] [[package]] -name = "sshpubkeys" -version = "3.3.1" -description = "SSH public key parser" +name = "sniffio" +version = "1.3.0" +description = "Sniff out which async library your code is running under" optional = false -python-versions = ">=3" 
+python-versions = ">=3.7" files = [ - {file = "sshpubkeys-3.3.1-py2.py3-none-any.whl", hash = "sha256:946f76b8fe86704b0e7c56a00d80294e39bc2305999844f079a217885060b1ac"}, - {file = "sshpubkeys-3.3.1.tar.gz", hash = "sha256:3020ed4f8c846849299370fbe98ff4157b0ccc1accec105e07cfa9ae4bb55064"}, + {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, + {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, +] + +[[package]] +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, + {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, ] [package.dependencies] -cryptography = ">=2.1.4" -ecdsa = ">=0.13" - -[package.extras] -dev = ["twine", "wheel", "yapf"] +mpmath = ">=0.19" [[package]] name = "toml" @@ -2348,13 +2806,13 @@ files = [ [[package]] name = "urllib3" -version = "1.26.18" +version = "1.26.19" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"}, - {file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"}, + {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, + {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, ] [package.extras] @@ -2378,15 +2836,96 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"] optional = ["python-socks", "wsaccel"] test = ["websockets"] +[[package]] +name = "websockets" +version = "12.0" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"}, + {file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"}, + {file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"}, + {file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"}, + {file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"}, + {file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"}, + {file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"}, + {file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"}, + {file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"}, + {file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"}, + {file = 
"websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"}, + {file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"}, + {file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"}, + {file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"}, + {file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"}, + {file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"}, + {file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"}, + {file = "websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"}, + {file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"}, + {file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"}, + {file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"}, + {file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"}, + {file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"}, + {file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"}, + {file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"}, + {file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"}, + {file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"}, + {file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"}, + {file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"}, + {file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"}, + {file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"}, + {file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"}, + {file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"}, + {file = 
"websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"}, + {file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"}, + {file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"}, + {file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"}, + {file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"}, + {file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"}, + {file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"}, + {file = 
"websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"}, + {file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"}, + {file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"}, + {file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"}, + {file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"}, +] + [[package]] name = "werkzeug" -version = "3.0.1" +version = "3.0.3" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" files = [ - {file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"}, - {file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"}, + {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"}, + {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"}, ] [package.dependencies] @@ -2421,6 +2960,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = 
"wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2658,4 +3207,4 @@ 
cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "35c237fe6a9278b2dc65b06ed96bde5afb9e393d52c01b00c59acf1df3a8d482" +content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 5fdfd00a6a..288f7769fe 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -5,12 +5,19 @@ edition.workspace = true license.workspace = true [features] -default = ["testing"] +default = [] testing = [] [dependencies] +ahash.workspace = true anyhow.workspace = true +async-compression.workspace = true async-trait.workspace = true +atomic-take.workspace = true +aws-config.workspace = true +aws-sdk-iam.workspace = true +aws-sigv4.workspace = true +aws-types.workspace = true base64.workspace = true bstr.workspace = true bytes = { workspace = true, features = ["serde"] } @@ -18,7 +25,10 @@ camino.workspace = true chrono.workspace = true clap.workspace = true consumption_metrics.workspace = true +crossbeam-deque.workspace = true dashmap.workspace = true +env_logger.workspace = true +framed-websockets.workspace = true futures.workspace = true git-version.workspace = true hashbrown.workspace = true @@ -26,19 +36,25 @@ hashlink.workspace = true hex.workspace = true hmac.workspace = true hostname.workspace = true +http.workspace = true humantime.workspace = true -hyper-tungstenite.workspace = true +humantime-serde.workspace = true hyper.workspace = true +hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } +http-body-util = { version = "0.1" } +indexmap.workspace = true ipnet.workspace = true itertools.workspace = true +lasso = { workspace = true, features = ["multi-threaded"] } md5.workspace = true +measured = { workspace = true, features = ["lasso"] } metrics.workspace = true once_cell.workspace = true opentelemetry.workspace = true parking_lot.workspace = true 
parquet.workspace = true parquet_derive.workspace = true -pbkdf2 = { workspace = true, features = ["simple", "std"] } pin-project-lite.workspace = true postgres_backend.workspace = true pq_proto.workspace = true @@ -46,8 +62,8 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } -reqwest = { workspace = true, features = ["json"] } -reqwest-middleware.workspace = true +reqwest.workspace = true +reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true routerify.workspace = true @@ -57,34 +73,43 @@ rustls.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true -sha2.workspace = true +sha2 = { workspace = true, features = ["asm"] } +smol_str.workspace = true +smallvec.workspace = true socket2.workspace = true -sync_wrapper.workspace = true +subtle.workspace = true task-local-extensions.workspace = true thiserror.workspace = true -tls-listener.workspace = true +tikv-jemallocator.workspace = true +tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } tokio-postgres.workspace = true +tokio-postgres-rustls.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } +tower-service.workspace = true tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true url.workspace = true +urlencoding.workspace = true utils.workspace = true uuid.workspace = true -webpki-roots.workspace = true +rustls-native-certs.workspace = true x509-parser.workspace = true -native-tls.workspace = true -postgres-native-tls.workspace = true postgres-protocol.workspace = true -smol_str.workspace = true +redis.workspace = true workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true +fallible-iterator.workspace = true 
+tokio-tungstenite.workspace = true +pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true tokio-postgres-rustls.workspace = true +walkdir.workspace = true +rand_distr = "0.4" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 64ef108e11..8c44823c98 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -4,7 +4,10 @@ pub mod backend; pub use backend::BackendType; mod credentials; -pub use credentials::{check_peer_addr_is_in_list, ClientCredentials}; +pub use credentials::{ + check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, + ComputeUserInfoParseError, IpPattern, +}; mod password_hack; pub use password_hack::parse_endpoint_param; @@ -12,9 +15,13 @@ use password_hack::PasswordHackPayload; mod flow; pub use flow::*; +use tokio::time::error::Elapsed; -use crate::{console, error::UserFacingError}; -use std::io; +use crate::{ + console, + error::{ReportableError, UserFacingError}, +}; +use std::{io, net::IpAddr}; use thiserror::Error; /// Convenience wrapper for the authentication error. @@ -29,9 +36,6 @@ pub enum AuthErrorImpl { #[error(transparent)] GetAuthInfo(#[from] console::errors::GetAuthInfoError), - #[error(transparent)] - WakeCompute(#[from] console::errors::WakeComputeError), - /// SASL protocol errors (includes [SCRAM](crate::scram)). #[error(transparent)] Sasl(#[from] crate::sasl::Error), @@ -58,13 +62,17 @@ pub enum AuthErrorImpl { Io(#[from] io::Error), #[error( - "This IP address is not allowed to connect to this endpoint. \ - Please add it to the allowed list in the Neon console." + "This IP address {0} is not allowed to connect to this endpoint. \ + Please add it to the allowed list in the Neon console. \ + Make sure to check for IPv4 or IPv6 addresses." )] - IpAddressNotAllowed, + IpAddressNotAllowed(IpAddr), #[error("Too many connections to this endpoint. 
Please try again later.")] TooManyConnections, + + #[error("Authentication timed out")] + UserTimeout(Elapsed), } #[derive(Debug, Error)] @@ -80,8 +88,8 @@ impl AuthError { AuthErrorImpl::AuthFailed(user.into()).into() } - pub fn ip_address_not_allowed() -> Self { - AuthErrorImpl::IpAddressNotAllowed.into() + pub fn ip_address_not_allowed(ip: IpAddr) -> Self { + AuthErrorImpl::IpAddressNotAllowed(ip).into() } pub fn too_many_connections() -> Self { @@ -91,6 +99,10 @@ impl AuthError { pub fn is_auth_failed(&self) -> bool { matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) } + + pub fn user_timeout(elapsed: Elapsed) -> Self { + AuthErrorImpl::UserTimeout(elapsed).into() + } } impl> From for AuthError { @@ -105,15 +117,34 @@ impl UserFacingError for AuthError { match self.0.as_ref() { Link(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), - WakeCompute(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), AuthFailed(_) => self.to_string(), BadAuthMethod(_) => self.to_string(), MalformedPassword(_) => self.to_string(), MissingEndpointName => self.to_string(), Io(_) => "Internal error".to_string(), - IpAddressNotAllowed => self.to_string(), + IpAddressNotAllowed(_) => self.to_string(), TooManyConnections => self.to_string(), + UserTimeout(_) => self.to_string(), + } + } +} + +impl ReportableError for AuthError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + use AuthErrorImpl::*; + match self.0.as_ref() { + Link(e) => e.get_error_kind(), + GetAuthInfo(e) => e.get_error_kind(), + Sasl(e) => e.get_error_kind(), + AuthFailed(_) => crate::error::ErrorKind::User, + BadAuthMethod(_) => crate::error::ErrorKind::User, + MalformedPassword(_) => crate::error::ErrorKind::User, + MissingEndpointName => crate::error::ErrorKind::User, + Io(_) => crate::error::ErrorKind::ClientDisconnect, + IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + TooManyConnections => crate::error::ErrorKind::RateLimit, + UserTimeout(_) => 
crate::error::ErrorKind::User, } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index d9bddff139..f757a15fbb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -2,162 +2,273 @@ mod classic; mod hacks; mod link; +use std::net::IpAddr; +use std::sync::Arc; +use std::time::Duration; + +use ipnet::{Ipv4Net, Ipv6Net}; pub use link::LinkAuthError; -use smol_str::SmolStr; +use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::AuthKeys; +use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::validate_password_and_exchange; +use crate::auth::{validate_password_and_exchange, AuthError}; +use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; -use crate::console::AuthSecret; +use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; +use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; -use crate::proxy::connect_compute::handle_try_wake; -use crate::proxy::retry::retry_after; -use crate::scram; +use crate::intern::EndpointIdInt; +use crate::metrics::Metrics; +use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::proxy::NeonOptions; +use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; use crate::{ - auth::{self, ClientCredentials}, + auth::{self, ComputeUserInfoMaybeEndpoint}, config::AuthenticationConfig, console::{ self, - provider::{CachedNodeInfo, ConsoleReqExtra}, + provider::{CachedAllowedIps, CachedNodeInfo}, Api, }, stream, url, }; -use futures::TryFutureExt; -use std::borrow::Cow; -use std::ops::ControlFlow; -use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, warn}; +use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; + +/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality +pub enum MaybeOwned<'a, T> { + Owned(T), + Borrowed(&'a T), +} + 
+impl std::ops::Deref for MaybeOwned<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + match self { + MaybeOwned::Owned(t) => t, + MaybeOwned::Borrowed(t) => t, + } + } +} /// This type serves two purposes: /// /// * When `T` is `()`, it's just a regular auth backend selector /// which we use in [`crate::config::ProxyConfig`]. /// -/// * However, when we substitute `T` with [`ClientCredentials`], +/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum BackendType<'a, T> { - /// Current Cloud API (V2). - Console(Cow<'a, console::provider::neon::Api>, T), - /// Local mock of Cloud API (V2). - #[cfg(feature = "testing")] - Postgres(Cow<'a, console::provider::mock::Api>, T), +pub enum BackendType<'a, T, D> { + /// Cloud API (V2). + Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. - Link(Cow<'a, url::ApiUrl>), - #[cfg(test)] - /// Test backend. 
- Test(&'a dyn TestBackend), + Link(MaybeOwned<'a, url::ApiUrl>, D), } pub trait TestBackend: Send + Sync + 'static { fn wake_compute(&self) -> Result; - fn get_allowed_ips(&self) -> Result>, console::errors::GetAuthInfoError>; + fn get_allowed_ips_and_secret( + &self, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError>; + fn get_role_secret(&self) -> Result; } -impl std::fmt::Display for BackendType<'_, ()> { +impl std::fmt::Display for BackendType<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use BackendType::*; match self { - Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(), - #[cfg(feature = "testing")] - Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(), - Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), - #[cfg(test)] - Test(_) => fmt.debug_tuple("Test").finish(), + Console(api, _) => match &**api { + ConsoleBackend::Console(endpoint) => { + fmt.debug_tuple("Console").field(&endpoint.url()).finish() + } + #[cfg(any(test, feature = "testing"))] + ConsoleBackend::Postgres(endpoint) => { + fmt.debug_tuple("Postgres").field(&endpoint.url()).finish() + } + #[cfg(test)] + ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), + }, + Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), } } } -impl BackendType<'_, T> { +impl BackendType<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. 
- pub fn as_ref(&self) -> BackendType<'_, &T> { + pub fn as_ref(&self) -> BackendType<'_, &T, &D> { use BackendType::*; match self { - Console(c, x) => Console(Cow::Borrowed(c), x), - #[cfg(feature = "testing")] - Postgres(c, x) => Postgres(Cow::Borrowed(c), x), - Link(c) => Link(Cow::Borrowed(c)), - #[cfg(test)] - Test(x) => Test(*x), + Console(c, x) => Console(MaybeOwned::Borrowed(c), x), + Link(c, x) => Link(MaybeOwned::Borrowed(c), x), } } } -impl<'a, T> BackendType<'a, T> { +impl<'a, T, D> BackendType<'a, T, D> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { use BackendType::*; match self { Console(c, x) => Console(c, f(x)), - #[cfg(feature = "testing")] - Postgres(c, x) => Postgres(c, f(x)), - Link(c) => Link(c), - #[cfg(test)] - Test(x) => Test(x), + Link(c, x) => Link(c, x), } } } - -impl<'a, T, E> BackendType<'a, Result> { +impl<'a, T, D, E> BackendType<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. 
- pub fn transpose(self) -> Result, E> { + pub fn transpose(self) -> Result, E> { use BackendType::*; match self { Console(c, x) => x.map(|x| Console(c, x)), - #[cfg(feature = "testing")] - Postgres(c, x) => x.map(|x| Postgres(c, x)), - Link(c) => Ok(Link(c)), - #[cfg(test)] - Test(x) => Ok(Test(x)), + Link(c, x) => Ok(Link(c, x)), } } } -pub struct ComputeCredentials { +pub struct ComputeCredentials { pub info: ComputeUserInfo, - pub keys: T, + pub keys: ComputeCredentialKeys, } +#[derive(Debug, Clone)] pub struct ComputeUserInfoNoEndpoint { - pub user: SmolStr, - pub cache_key: SmolStr, + pub user: RoleName, + pub options: NeonOptions, } +#[derive(Debug, Clone)] pub struct ComputeUserInfo { - pub endpoint: SmolStr, - pub inner: ComputeUserInfoNoEndpoint, + pub endpoint: EndpointId, + pub user: RoleName, + pub options: NeonOptions, +} + +impl ComputeUserInfo { + pub fn endpoint_cache_key(&self) -> EndpointCacheKey { + self.options.get_cache_key(&self.endpoint) + } } pub enum ComputeCredentialKeys { - #[cfg(feature = "testing")] Password(Vec), AuthKeys(AuthKeys), } -impl TryFrom for ComputeUserInfo { +impl TryFrom for ComputeUserInfo { // user name type Error = ComputeUserInfoNoEndpoint; - fn try_from(creds: ClientCredentials) -> Result { - let inner = ComputeUserInfoNoEndpoint { - user: creds.user, - cache_key: creds.cache_key, - }; - match creds.project { - None => Err(inner), - Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }), + fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result { + match user_info.endpoint_id { + None => Err(ComputeUserInfoNoEndpoint { + user: user_info.user, + options: user_info.options, + }), + Some(endpoint) => Ok(ComputeUserInfo { + endpoint, + user: user_info.user, + options: user_info.options, + }), } } } +#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)] +pub struct MaskedIp(IpAddr); + +impl MaskedIp { + fn new(value: IpAddr, prefix: u8) -> Self { + match value { + IpAddr::V4(v4) => Self(IpAddr::V4( 
+ Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()), + )), + IpAddr::V6(v6) => Self(IpAddr::V6( + Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()), + )), + } + } +} + +// This can't be just per IP because that would limit some PaaS that share IP addresses +pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>; + +impl RateBucketInfo { + /// All of these are per endpoint-maskedip pair. + /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus). + /// + /// First bucket: 1000mcpus total per endpoint-ip pair + /// * 4096000 requests per second with 1 hash rounds. + /// * 1000 requests per second with 4096 hash rounds. + /// * 6.8 requests per second with 600000 hash rounds. + pub const DEFAULT_AUTH_SET: [Self; 3] = [ + Self::new(1000 * 4096, Duration::from_secs(1)), + Self::new(600 * 4096, Duration::from_secs(60)), + Self::new(300 * 4096, Duration::from_secs(600)), + ]; +} + +impl AuthenticationConfig { + pub fn check_rate_limit( + &self, + ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, + secret: AuthSecret, + endpoint: &EndpointId, + is_cleartext: bool, + ) -> auth::Result { + // we have validated the endpoint exists, so let's intern it. + let endpoint_int = EndpointIdInt::from(endpoint.normalize()); + + // only count the full hash count if password hack or websocket flow. + // in other words, if proxy needs to run the hashing + let password_weight = if is_cleartext { + match &secret { + #[cfg(any(test, feature = "testing"))] + AuthSecret::Md5(_) => 1, + AuthSecret::Scram(s) => s.iterations + 1, + } + } else { + // validating scram takes just 1 hmac_sha_256 operation. 
+ 1 + }; + + let limit_not_exceeded = self.rate_limiter.check( + ( + endpoint_int, + MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet), + ), + password_weight, + ); + + if !limit_not_exceeded { + warn!( + enabled = self.rate_limiter_enabled, + "rate limiting authentication" + ); + Metrics::get().proxy.requests_auth_rate_limits_total.inc(); + Metrics::get() + .proxy + .endpoints_auth_rate_limits + .get_metric() + .measure(endpoint); + + if self.rate_limiter_enabled { + return Err(auth::AuthError::too_many_connections()); + } + } + + Ok(secret) + } +} + /// True to its name, this function encapsulates our current auth trade-offs. /// Here, we choose the appropriate auth flow based on circumstances. /// @@ -165,40 +276,63 @@ impl TryFrom for ComputeUserInfo { async fn auth_quirks( ctx: &mut RequestMonitoring, api: &impl console::Api, - creds: ClientCredentials, + user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { + endpoint_rate_limiter: Arc, +) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. 
- let (info, unauthenticated_password) = match creds.try_into() { + let (info, unauthenticated_password) = match user_info.try_into() { Err(info) => { - let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer) - .await?; - ctx.set_endpoint_id(Some(res.info.endpoint.clone())); - (res.info, Some(res.keys)) + let res = hacks::password_hack_no_authentication(ctx, info, client).await?; + + ctx.set_endpoint_id(res.info.endpoint.clone()); + let password = match res.keys { + ComputeCredentialKeys::Password(p) => p, + _ => unreachable!("password hack should return a password"), + }; + (res.info, Some(password)) } Ok(info) => (info, None), }; info!("fetching user's authentication info"); - let allowed_ips = api.get_allowed_ips(ctx, &info).await?; + let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?; // check allowed list if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - return Err(auth::AuthError::ip_address_not_allowed()); + return Err(auth::AuthError::ip_address_not_allowed(ctx.peer_addr)); } - let cached_secret = api.get_role_secret(ctx, &info).await?; - let secret = cached_secret.clone().unwrap_or_else(|| { - // If we don't have an authentication secret, we mock one to - // prevent malicious probing (possible due to missing protocol steps). - // This mocked secret will never lead to successful authentication. 
- info!("authentication info not found, mocking it"); - AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random())) - }); + if !endpoint_rate_limiter.check(info.endpoint.clone().into(), 1) { + return Err(AuthError::too_many_connections()); + } + let cached_secret = match maybe_secret { + Some(secret) => secret, + None => api.get_role_secret(ctx, &info).await?, + }; + let (cached_entry, secret) = cached_secret.take_value(); + + let secret = match secret { + Some(secret) => config.check_rate_limit( + ctx, + config, + secret, + &info.endpoint, + unauthenticated_password.is_some() || allow_cleartext, + )?, + None => { + // If we don't have an authentication secret, we mock one to + // prevent malicious probing (possible due to missing protocol steps). + // This mocked secret will never lead to successful authentication. + info!("authentication info not found, mocking it"); + AuthSecret::Scram(scram::ServerSecret::mock(rand::random())) + } + }; + match authenticate_with_secret( ctx, secret, @@ -214,7 +348,7 @@ async fn auth_quirks( Err(e) => { if e.is_auth_failed() { // The password could have been changed, so we invalidate the cache. 
- cached_secret.invalidate(); + cached_entry.invalidate(); } Err(e) } @@ -229,14 +363,17 @@ async fn authenticate_with_secret( unauthenticated_password: Option>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { if let Some(password) = unauthenticated_password { - let auth_outcome = validate_password_and_exchange(&password, secret)?; + let ep = EndpointIdInt::from(&info.endpoint); + + let auth_outcome = + validate_password_and_exchange(&config.thread_pool, ep, &password, secret).await?; let keys = match auth_outcome { crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return Err(auth::AuthError::auth_failed(&*info.inner.user)); + return Err(auth::AuthError::auth_failed(&*info.user)); } }; @@ -251,70 +388,22 @@ async fn authenticate_with_secret( // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). if allow_cleartext { - return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await; + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + return hacks::authenticate_cleartext(ctx, info, client, secret, config).await; } // Finally, proceed with the main auth flow (SCRAM-based). - classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await + classic::authenticate(ctx, info, client, config, secret).await } -/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache) -/// only if authentication was successfuly. 
-async fn auth_and_wake_compute( - ctx: &mut RequestMonitoring, - api: &impl console::Api, - extra: &ConsoleReqExtra, - creds: ClientCredentials, - client: &mut stream::PqStream>, - allow_cleartext: bool, - config: &'static AuthenticationConfig, -) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> { - let compute_credentials = auth_quirks(ctx, api, creds, client, allow_cleartext, config).await?; - - let mut num_retries = 0; - let mut node = loop { - let wake_res = api - .wake_compute(ctx, extra, &compute_credentials.info) - .await; - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - return Err(e.into()); - } - Ok(ControlFlow::Continue(e)) => { - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - Ok(ControlFlow::Break(n)) => break n, - } - - let wait_duration = retry_after(num_retries); - num_retries += 1; - tokio::time::sleep(wait_duration).await; - }; - - ctx.set_project(node.aux.clone()); - - match compute_credentials.keys { - #[cfg(feature = "testing")] - ComputeCredentialKeys::Password(password) => node.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), - }; - - Ok((node, compute_credentials.info)) -} - -impl<'a> BackendType<'a, ClientCredentials> { +impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Get compute endpoint name from the credentials. 
- pub fn get_endpoint(&self) -> Option { + pub fn get_endpoint(&self) -> Option { use BackendType::*; match self { - Console(_, creds) => creds.project.clone(), - #[cfg(feature = "testing")] - Postgres(_, creds) => creds.project.clone(), - Link(_) => Some("link".into()), - #[cfg(test)] - Test(_) => Some("test".into()), + Console(_, user_info) => user_info.endpoint_id.clone(), + Link(_, _) => Some("link".into()), } } @@ -323,12 +412,8 @@ impl<'a> BackendType<'a, ClientCredentials> { use BackendType::*; match self { - Console(_, creds) => &creds.user, - #[cfg(feature = "testing")] - Postgres(_, creds) => &creds.user, - Link(_) => "link", - #[cfg(test)] - Test(_) => "test", + Console(_, user_info) => &user_info.user, + Link(_, _) => "link", } } @@ -337,67 +422,40 @@ impl<'a> BackendType<'a, ClientCredentials> { pub async fn authenticate( self, ctx: &mut RequestMonitoring, - extra: &ConsoleReqExtra, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, - ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> { + endpoint_rate_limiter: Arc, + ) -> auth::Result> { use BackendType::*; let res = match self { - Console(api, creds) => { + Console(api, user_info) => { info!( - user = &*creds.user, - project = creds.project(), + user = &*user_info.user, + project = user_info.endpoint(), "performing authentication using the console" ); - let (cache_info, user_info) = auth_and_wake_compute( + let credentials = auth_quirks( ctx, &*api, - extra, - creds, + user_info, client, allow_cleartext, config, + endpoint_rate_limiter, ) .await?; - (cache_info, BackendType::Console(api, user_info)) - } - #[cfg(feature = "testing")] - Postgres(api, creds) => { - info!( - user = &*creds.user, - project = creds.project(), - "performing authentication using a local postgres instance" - ); - - let (cache_info, user_info) = auth_and_wake_compute( - ctx, - &*api, - extra, - creds, - client, - allow_cleartext, - config, - ) - .await?; - 
(cache_info, BackendType::Postgres(api, user_info)) + BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. - Link(url) => { + Link(url, _) => { info!("performing link authentication"); - let node_info = link::authenticate(&url, client).await?; + let info = link::authenticate(ctx, &url, client).await?; - ( - CachedNodeInfo::new_uncached(node_info), - BackendType::Link(url), - ) - } - #[cfg(test)] - Test(_) => { - unreachable!("this function should never be called in the test backend") + BackendType::Link(url, info) } }; @@ -406,38 +464,377 @@ impl<'a> BackendType<'a, ClientCredentials> { } } -impl BackendType<'_, ComputeUserInfo> { - pub async fn get_allowed_ips( +impl BackendType<'_, ComputeUserInfo, &()> { + pub async fn get_role_secret( &self, ctx: &mut RequestMonitoring, - ) -> Result>, GetAuthInfoError> { + ) -> Result { use BackendType::*; match self { - Console(api, creds) => api.get_allowed_ips(ctx, creds).await, - #[cfg(feature = "testing")] - Postgres(api, creds) => api.get_allowed_ips(ctx, creds).await, - Link(_) => Ok(Arc::new(vec![])), - #[cfg(test)] - Test(x) => x.get_allowed_ips(), + Console(api, user_info) => api.get_role_secret(ctx, user_info).await, + Link(_, _) => Ok(Cached::new_uncached(None)), } } - /// When applicable, wake the compute node, gaining its connection info in the process. - /// The link auth flow doesn't support this, so we return [`None`] in that case. 
- pub async fn wake_compute( + pub async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, - extra: &ConsoleReqExtra, - ) -> Result, console::errors::WakeComputeError> { + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { use BackendType::*; - match self { - Console(api, creds) => api.wake_compute(ctx, extra, creds).map_ok(Some).await, - #[cfg(feature = "testing")] - Postgres(api, creds) => api.wake_compute(ctx, extra, creds).map_ok(Some).await, - Link(_) => Ok(None), - #[cfg(test)] - Test(x) => x.wake_compute().map(Some), + Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, + Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, info) => Ok(Cached::new_uncached(info.clone())), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, + } + } +} + +#[cfg(test)] +mod tests { + use std::{net::IpAddr, sync::Arc, time::Duration}; + + use bytes::BytesMut; + use fallible_iterator::FallibleIterator; + use once_cell::sync::Lazy; + use 
postgres_protocol::{ + authentication::sasl::{ChannelBinding, ScramSha256}, + message::{backend::Message as PgMessage, frontend}, + }; + use provider::AuthSecret; + use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; + + use crate::{ + auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, + config::AuthenticationConfig, + console::{ + self, + provider::{self, CachedAllowedIps, CachedRoleSecret}, + CachedNodeInfo, + }, + context::RequestMonitoring, + proxy::NeonOptions, + rate_limiter::{EndpointRateLimiter, RateBucketInfo}, + scram::{threadpool::ThreadPool, ServerSecret}, + stream::{PqStream, Stream}, + }; + + use super::{auth_quirks, AuthRateLimiter}; + + struct Auth { + ips: Vec, + secret: AuthSecret, + } + + impl console::Api for Auth { + async fn get_role_secret( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result { + Ok(CachedRoleSecret::new_uncached(Some(self.secret.clone()))) + } + + async fn get_allowed_ips_and_secret( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> + { + Ok(( + CachedAllowedIps::new_uncached(Arc::new(self.ips.clone())), + Some(CachedRoleSecret::new_uncached(Some(self.secret.clone()))), + )) + } + + async fn wake_compute( + &self, + _ctx: &mut RequestMonitoring, + _user_info: &super::ComputeUserInfo, + ) -> Result { + unimplemented!() + } + } + + static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { + thread_pool: ThreadPool::new(1), + scram_protocol_timeout: std::time::Duration::from_secs(5), + rate_limiter_enabled: true, + rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), + rate_limit_ip_subnet: 64, + }); + + async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { + loop { + r.read_buf(&mut *b).await.unwrap(); + if let Some(m) = PgMessage::parse(&mut *b).unwrap() { + break m; + } + } + } + + #[test] + fn masked_ip() 
{ + let ip_a = IpAddr::V4([127, 0, 0, 1].into()); + let ip_b = IpAddr::V4([127, 0, 0, 2].into()); + let ip_c = IpAddr::V4([192, 168, 1, 101].into()); + let ip_d = IpAddr::V4([192, 168, 1, 102].into()); + let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap()); + let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap()); + + assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64)); + assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32)); + assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30)); + assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30)); + + assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128)); + assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64)); + } + + #[test] + fn test_default_auth_rate_limit_set() { + // these values used to exceed u32::MAX + assert_eq!( + RateBucketInfo::DEFAULT_AUTH_SET, + [ + RateBucketInfo { + interval: Duration::from_secs(1), + max_rpi: 1000 * 4096, + }, + RateBucketInfo { + interval: Duration::from_secs(60), + max_rpi: 600 * 4096 * 60, + }, + RateBucketInfo { + interval: Duration::from_secs(600), + max_rpi: 300 * 4096 * 600, + } + ] + ); + + for x in RateBucketInfo::DEFAULT_AUTH_SET { + let y = x.to_string().parse().unwrap(); + assert_eq!(x, y); + } + } + + #[tokio::test] + async fn auth_quirks_scram() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut scram = ScramSha256::new(b"my-secret-password", ChannelBinding::unsupported()); + + let mut read = BytesMut::new(); + + // server should offer 
scram + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSasl(a) => { + let options: Vec<&str> = a.mechanisms().collect().unwrap(); + assert_eq!(options, ["SCRAM-SHA-256"]); + } + _ => panic!("wrong message"), + } + + // client sends client-first-message + let mut write = BytesMut::new(); + frontend::sasl_initial_response("SCRAM-SHA-256", scram.message(), &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + + // server response with server-first-message + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSaslContinue(a) => { + scram.update(a.data()).await.unwrap(); + } + _ => panic!("wrong message"), + } + + // client response with client-final-message + write.clear(); + frontend::sasl_response(scram.message(), &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + + // server response with server-final-message + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationSaslFinal(a) => { + scram.finish(a.data()).unwrap(); + } + _ => panic!("wrong message"), + } + }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + false, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_cleartext() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: Some("endpoint".into()), + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + let mut write = BytesMut::new(); + + // server should 
offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + write.clear(); + frontend::password_message(b"my-secret-password", &mut write).unwrap(); + client.write_all(&write).await.unwrap(); + }); + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let _creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); + + handle.await.unwrap(); + } + + #[tokio::test] + async fn auth_quirks_password_hack() { + let (mut client, server) = tokio::io::duplex(1024); + let mut stream = PqStream::new(Stream::from_raw(server)); + + let mut ctx = RequestMonitoring::test(); + let api = Auth { + ips: vec![], + secret: AuthSecret::Scram(ServerSecret::build("my-secret-password").await.unwrap()), + }; + + let user_info = ComputeUserInfoMaybeEndpoint { + user: "conrad".into(), + endpoint_id: None, + options: NeonOptions::default(), + }; + + let handle = tokio::spawn(async move { + let mut read = BytesMut::new(); + + // server should offer cleartext + match read_message(&mut client, &mut read).await { + PgMessage::AuthenticationCleartextPassword => {} + _ => panic!("wrong message"), + } + + // client responds with password + let mut write = BytesMut::new(); + frontend::password_message(b"endpoint=my-endpoint;my-secret-password", &mut write) + .unwrap(); + client.write_all(&write).await.unwrap(); + }); + + let endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET)); + + let creds = auth_quirks( + &mut ctx, + &api, + user_info, + &mut stream, + true, + &CONFIG, + endpoint_rate_limiter, + ) + .await + .unwrap(); + + assert_eq!(creds.info.endpoint, "my-endpoint"); + + handle.await.unwrap(); + } +} diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 
5c394ec649..b98fa63120 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -4,7 +4,7 @@ use crate::{ compute, config::AuthenticationConfig, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, sasl, stream::{PqStream, Stream}, }; @@ -12,28 +12,26 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( + ctx: &mut RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result { let flow = AuthFlow::new(client); let scram_keys = match secret { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] AuthSecret::Md5(_) => { info!("auth endpoint chooses MD5"); return Err(auth::AuthError::bad_auth_method("MD5")); } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret); + let scram = auth::Scram(&secret, &mut *ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, async { - // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); flow.begin(scram).await.map_err(|error| { warn!(?error, "error sending scram acknowledgement"); @@ -45,16 +43,16 @@ pub(super) async fn authenticate( } ) .await - .map_err(|error| { - warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs()); - auth::io::Error::new(auth::io::ErrorKind::TimedOut, error) + .map_err(|e| { + warn!("error processing scram messages error = authentication timed out, execution time exceeded {} seconds", config.scram_protocol_timeout.as_secs()); + auth::AuthError::user_timeout(e) })??; let client_key = match auth_outcome { sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return 
Err(auth::AuthError::auth_failed(&*creds.inner.user)); + return Err(auth::AuthError::auth_failed(&*creds.user)); } }; diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 5dde514bca..6b0f5e1726 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -3,8 +3,10 @@ use super::{ }; use crate::{ auth::{self, AuthFlow}, + config::AuthenticationConfig, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, + intern::EndpointIdInt, sasl, stream::{self, Stream}, }; @@ -16,27 +18,38 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. pub async fn authenticate_cleartext( + ctx: &mut RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, secret: AuthSecret, -) -> auth::Result> { + config: &'static AuthenticationConfig, +) -> auth::Result { warn!("cleartext auth flow override is enabled, proceeding"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); - let auth_outcome = AuthFlow::new(client) - .begin(auth::CleartextPassword(secret)) - .await? - .authenticate() + let ep = EndpointIdInt::from(&info.endpoint); + + let auth_flow = AuthFlow::new(client) + .begin(auth::CleartextPassword { + secret, + endpoint: ep, + pool: config.thread_pool.clone(), + }) .await?; + drop(paused); + // cleartext auth is only allowed to the ws/http protocol. + // If we're here, we already received the password in the first message. + // Scram protocol will be executed on the proxy side. 
+ let auth_outcome = auth_flow.authenticate().await?; let keys = match auth_outcome { sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return Err(auth::AuthError::auth_failed(&*info.inner.user)); + return Err(auth::AuthError::auth_failed(&*info.user)); } }; @@ -47,14 +60,15 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) pub async fn password_hack_no_authentication( + ctx: &mut RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, -) -> auth::Result>> { +) -> auth::Result { warn!("project not specified, resorting to the password hack auth flow"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) @@ -67,9 +81,10 @@ pub async fn password_hack_no_authentication( // Report tentative success; compute node will check the password anyway. 
Ok(ComputeCredentials { info: ComputeUserInfo { - inner: info, + user: info.user, + options: info.options, endpoint: payload.endpoint, }, - keys: payload.password, + keys: ComputeCredentialKeys::Password(payload.password), }) } diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 2cf7e3acc7..5932e1337c 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -1,7 +1,8 @@ use crate::{ auth, compute, console::{self, provider::NodeInfo}, - error::UserFacingError, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, stream::PqStream, waiters, }; @@ -13,10 +14,6 @@ use tracing::{info, info_span}; #[derive(Debug, Error)] pub enum LinkAuthError { - /// Authentication error reported by the console. - #[error("Authentication failed: {0}")] - AuthFailed(String), - #[error(transparent)] WaiterRegister(#[from] waiters::RegisterError), @@ -29,10 +26,16 @@ pub enum LinkAuthError { impl UserFacingError for LinkAuthError { fn to_string_client(&self) -> String { - use LinkAuthError::*; + "Internal error".to_string() + } +} + +impl ReportableError for LinkAuthError { + fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - AuthFailed(_) => self.to_string(), - _ => "Internal error".to_string(), + LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service, + LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service, + LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect, } } } @@ -54,27 +57,37 @@ pub fn new_psql_session_id() -> String { } pub(super) async fn authenticate( + ctx: &mut RequestMonitoring, link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { - let psql_session_id = new_psql_session_id(); + ctx.set_auth_method(crate::context::AuthMethod::Web); + + // registering waiter can fail if we get unlucky with rng. + // just try again. 
+ let (psql_session_id, waiter) = loop { + let psql_session_id = new_psql_session_id(); + + match console::mgmt::get_waiter(&psql_session_id) { + Ok(waiter) => break (psql_session_id, waiter), + Err(_e) => continue, + } + }; + let span = info_span!("link", psql_session_id = &psql_session_id); let greeting = hello_message(link_uri, &psql_session_id); - let db_info = console::mgmt::with_waiter(psql_session_id, |waiter| async { - // Give user a URL to spawn a new database. - info!(parent: &span, "sending the auth URL to the user"); - client - .write_message_noflush(&Be::AuthenticationOk)? - .write_message_noflush(&Be::CLIENT_ENCODING)? - .write_message(&Be::NoticeResponse(&greeting)) - .await?; + // Give user a URL to spawn a new database. + info!(parent: &span, "sending the auth URL to the user"); + client + .write_message_noflush(&Be::AuthenticationOk)? + .write_message_noflush(&Be::CLIENT_ENCODING)? + .write_message(&Be::NoticeResponse(&greeting)) + .await?; - // Wait for web console response (see `mgmt`). - info!(parent: &span, "waiting for console's reply..."); - waiter.await?.map_err(LinkAuthError::AuthFailed) - }) - .await?; + // Wait for web console response (see `mgmt`). + info!(parent: &span, "waiting for console's reply..."); + let db_info = waiter.await.map_err(LinkAuthError::from)?; client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; @@ -87,6 +100,11 @@ pub(super) async fn authenticate( .dbname(&db_info.dbname) .user(&db_info.user); + ctx.set_dbname(db_info.dbname.into()); + ctx.set_user(db_info.user.into()); + ctx.set_project(db_info.aux.clone()); + info!("woken up a compute node"); + // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy // everywhere, we can remove this. 
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 4ddfa722e1..d06f5614f1 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,18 +1,22 @@ //! User credentials used in authentication. use crate::{ - auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError, - metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str, + auth::password_hack::parse_endpoint_param, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, + metrics::{Metrics, SniKind}, + proxy::NeonOptions, + serverless::SERVERLESS_DRIVER_SNI, + EndpointId, RoleName, }; use itertools::Itertools; use pq_proto::StartupMessageParams; -use smol_str::SmolStr; -use std::{collections::HashSet, net::IpAddr}; +use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; #[derive(Debug, Error, PartialEq, Eq, Clone)] -pub enum ClientCredsParseError { +pub enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), @@ -21,7 +25,10 @@ pub enum ClientCredsParseError { SNI ('{}') and project option ('{}').", .domain, .option, )] - InconsistentProjectNames { domain: SmolStr, option: SmolStr }, + InconsistentProjectNames { + domain: EndpointId, + option: EndpointId, + }, #[error( "Common name inferred from SNI ('{}') is not known", @@ -30,49 +37,66 @@ pub enum ClientCredsParseError { UnknownCommonName { cn: String }, #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] - MalformedProjectName(SmolStr), + MalformedProjectName(EndpointId), } -impl UserFacingError for ClientCredsParseError {} +impl UserFacingError for ComputeUserInfoParseError {} + +impl ReportableError for ComputeUserInfoParseError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + crate::error::ErrorKind::User + } +} /// Various client credentials which we use for authentication. 
/// Note that we don't store any kind of client key or password here. #[derive(Debug, Clone, PartialEq, Eq)] -pub struct ClientCredentials { - pub user: SmolStr, - // TODO: this is a severe misnomer! We should think of a new name ASAP. - pub project: Option, - - pub cache_key: SmolStr, +pub struct ComputeUserInfoMaybeEndpoint { + pub user: RoleName, + pub endpoint_id: Option, + pub options: NeonOptions, } -impl ClientCredentials { +impl ComputeUserInfoMaybeEndpoint { #[inline] - pub fn project(&self) -> Option<&str> { - self.project.as_deref() + pub fn endpoint(&self) -> Option<&str> { + self.endpoint_id.as_deref() } } -impl ClientCredentials { +pub fn endpoint_sni( + sni: &str, + common_names: &HashSet, +) -> Result, ComputeUserInfoParseError> { + let Some((subdomain, common_name)) = sni.split_once('.') else { + return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() }); + }; + if !common_names.contains(common_name) { + return Err(ComputeUserInfoParseError::UnknownCommonName { + cn: common_name.into(), + }); + } + if subdomain == SERVERLESS_DRIVER_SNI { + return Ok(None); + } + Ok(Some(EndpointId::from(subdomain))) +} + +impl ComputeUserInfoMaybeEndpoint { pub fn parse( ctx: &mut RequestMonitoring, params: &StartupMessageParams, sni: Option<&str>, - common_names: Option>, - ) -> Result { - use ClientCredsParseError::*; + common_names: Option<&HashSet>, + ) -> Result { + use ComputeUserInfoParseError::*; // Some parameters are stored in the startup message. let get_param = |key| params.get(key).ok_or(MissingKey(key)); - let user: SmolStr = get_param("user")?.into(); - - // record the values if we have them - ctx.set_application(params.get("application_name").map(SmolStr::from)); - ctx.set_user(user.clone()); - ctx.set_endpoint_id(sni.map(SmolStr::from)); + let user: RoleName = get_param("user")?.into(); // Project name might be passed via PG's command-line options. 
- let project_option = params + let endpoint_option = params .options_raw() .and_then(|options| { // We support both `project` (deprecated) and `endpoint` options for backward compatibility. @@ -85,23 +109,9 @@ impl ClientCredentials { }) .map(|name| name.into()); - let project_from_domain = if let Some(sni_str) = sni { + let endpoint_from_domain = if let Some(sni_str) = sni { if let Some(cn) = common_names { - let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain); - - let project = common_name_from_sni - .and_then(|domain| { - if cn.contains(domain) { - subdomain_from_sni(sni_str, domain) - } else { - None - } - }) - .ok_or_else(|| UnknownCommonName { - cn: common_name_from_sni.unwrap_or("").into(), - })?; - - Some(project) + endpoint_sni(sni_str, cn)? } else { None } @@ -109,76 +119,97 @@ impl ClientCredentials { None }; - let project = match (project_option, project_from_domain) { + let endpoint = match (endpoint_option, endpoint_from_domain) { // Invariant: if we have both project name variants, they should match. (Some(option), Some(domain)) if option != domain => { Some(Err(InconsistentProjectNames { domain, option })) } // Invariant: project name may not contain certain characters. 
- (a, b) => a.or(b).map(|name| match project_name_valid(&name) { + (a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) { false => Err(MalformedProjectName(name)), true => Ok(name), }), } .transpose()?; - info!(%user, project = project.as_deref(), "credentials"); + if let Some(ep) = &endpoint { + ctx.set_endpoint_id(ep.clone()); + } + + let metrics = Metrics::get(); + info!(%user, "credentials"); if sni.is_some() { info!("Connection with sni"); - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["sni"]) - .inc(); - } else if project.is_some() { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["no_sni"]) - .inc(); + metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); + } else if endpoint.is_some() { + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::NoSni); info!("Connection without sni"); } else { - NUM_CONNECTION_ACCEPTED_BY_SNI - .with_label_values(&["password_hack"]) - .inc(); + metrics + .proxy + .accepted_connections_by_sni + .inc(SniKind::PasswordHack); info!("Connection with password hack"); } - let cache_key = format!( - "{}{}", - project.as_deref().unwrap_or(""), - neon_options_str(params) - ) - .into(); + let options = NeonOptions::parse_params(params); Ok(Self { user, - project, - cache_key, + endpoint_id: endpoint, + options, }) } } -pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec) -> bool { - if ip_list.is_empty() { - return true; - } - for ip in ip_list { - // We expect that all ip addresses from control plane are correct. - // However, if some of them are broken, we still can check the others. 
- match parse_ip_pattern(ip) { - Ok(pattern) => { - if check_ip(peer_addr, &pattern) { - return true; - } - } - Err(err) => warn!("Cannot parse ip: {}; err: {}", ip, err), - } - } - false +pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool { + ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern)) } #[derive(Debug, Clone, Eq, PartialEq)] -enum IpPattern { +pub enum IpPattern { Subnet(ipnet::IpNet), Range(IpAddr, IpAddr), Single(IpAddr), + None, +} + +impl<'de> serde::de::Deserialize<'de> for IpPattern { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct StrVisitor; + impl<'de> serde::de::Visitor<'de> for StrVisitor { + type Value = IpPattern; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "comma separated list with ip address, ip address range, or ip address subnet mask") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(parse_ip_pattern(v).unwrap_or_else(|e| { + warn!("Cannot parse ip pattern {v}: {e}"); + IpPattern::None + })) + } + } + deserializer.deserialize_str(StrVisitor) + } +} + +impl FromStr for IpPattern { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + parse_ip_pattern(s) + } } fn parse_ip_pattern(pattern: &str) -> anyhow::Result { @@ -200,6 +231,7 @@ fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool { IpPattern::Subnet(subnet) => subnet.contains(ip), IpPattern::Range(start, end) => start <= ip && ip <= end, IpPattern::Single(addr) => addr == ip, + IpPattern::None => false, } } @@ -207,25 +239,20 @@ fn project_name_valid(name: &str) -> bool { name.chars().all(|c| c.is_alphanumeric() || c == '-') } -fn subdomain_from_sni(sni: &str, common_name: &str) -> Option { - sni.strip_suffix(common_name)? 
- .strip_suffix('.') - .map(SmolStr::from) -} - #[cfg(test)] mod tests { use super::*; - use ClientCredsParseError::*; + use serde_json::json; + use ComputeUserInfoParseError::*; #[test] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. let options = StartupMessageParams::new([("user", "john_doe")]); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project, None); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.endpoint_id, None); Ok(()) } @@ -238,9 +265,9 @@ mod tests { ("foo", "bar"), // should be ignored ]); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project, None); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.endpoint_id, None); Ok(()) } @@ -253,10 +280,11 @@ mod tests { let common_names = Some(["localhost".into()].into()); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project.as_deref(), Some("foo")); - assert_eq!(creds.cache_key, "foo"); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.endpoint_id.as_deref(), Some("foo")); + assert_eq!(user_info.options.get_cache_key("foo"), "foo"); Ok(()) } @@ -269,9 +297,9 @@ mod tests { ]); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?; - assert_eq!(creds.user, "john_doe"); - 
assert_eq!(creds.project.as_deref(), Some("bar")); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); Ok(()) } @@ -284,9 +312,9 @@ mod tests { ]); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project.as_deref(), Some("bar")); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.endpoint_id.as_deref(), Some("bar")); Ok(()) } @@ -302,9 +330,9 @@ mod tests { ]); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?; - assert_eq!(creds.user, "john_doe"); - assert!(creds.project.is_none()); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert!(user_info.endpoint_id.is_none()); Ok(()) } @@ -317,9 +345,9 @@ mod tests { ]); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?; - assert_eq!(creds.user, "john_doe"); - assert!(creds.project.is_none()); + let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?; + assert_eq!(user_info.user, "john_doe"); + assert!(user_info.endpoint_id.is_none()); Ok(()) } @@ -332,9 +360,10 @@ mod tests { let common_names = Some(["localhost".into()].into()); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?; - assert_eq!(creds.user, "john_doe"); - assert_eq!(creds.project.as_deref(), Some("baz")); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.user, "john_doe"); + assert_eq!(user_info.endpoint_id.as_deref(), 
Some("baz")); Ok(()) } @@ -346,14 +375,16 @@ mod tests { let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.a.com"); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?; - assert_eq!(creds.project.as_deref(), Some("p1")); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); let common_names = Some(["a.com".into(), "b.com".into()].into()); let sni = Some("p1.b.com"); let mut ctx = RequestMonitoring::test(); - let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?; - assert_eq!(creds.project.as_deref(), Some("p1")); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.endpoint_id.as_deref(), Some("p1")); Ok(()) } @@ -367,8 +398,9 @@ mod tests { let common_names = Some(["localhost".into()].into()); let mut ctx = RequestMonitoring::test(); - let err = ClientCredentials::parse(&mut ctx, &options, sni, common_names) - .expect_err("should fail"); + let err = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -386,8 +418,9 @@ mod tests { let common_names = Some(["example.com".into()].into()); let mut ctx = RequestMonitoring::test(); - let err = ClientCredentials::parse(&mut ctx, &options, sni, common_names) - .expect_err("should fail"); + let err = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref()) + .expect_err("should fail"); match err { UnknownCommonName { cn } => { assert_eq!(cn, "localhost"); @@ -406,30 +439,30 @@ mod tests { let sni = Some("project.localhost"); let common_names = Some(["localhost".into()].into()); let mut ctx = RequestMonitoring::test(); - let creds = 
ClientCredentials::parse(&mut ctx, &options, sni, common_names)?; - assert_eq!(creds.project.as_deref(), Some("project")); - assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2"); + let user_info = + ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?; + assert_eq!(user_info.endpoint_id.as_deref(), Some("project")); + assert_eq!( + user_info.options.get_cache_key("project"), + "project endpoint_type:read_write lsn:0/2" + ); Ok(()) } #[test] fn test_check_peer_addr_is_in_list() { - let peer_addr = IpAddr::from([127, 0, 0, 1]); - assert!(check_peer_addr_is_in_list(&peer_addr, &vec![])); - assert!(check_peer_addr_is_in_list( - &peer_addr, - &vec!["127.0.0.1".into()] - )); - assert!(!check_peer_addr_is_in_list( - &peer_addr, - &vec!["8.8.8.8".into()] - )); + fn check(v: serde_json::Value) -> bool { + let peer_addr = IpAddr::from([127, 0, 0, 1]); + let ip_list: Vec = serde_json::from_value(v).unwrap(); + check_peer_addr_is_in_list(&peer_addr, &ip_list) + } + + assert!(check(json!([]))); + assert!(check(json!(["127.0.0.1"]))); + assert!(!check(json!(["8.8.8.8"]))); // If there is an incorrect address, it will be skipped. 
- assert!(check_peer_addr_is_in_list( - &peer_addr, - &vec!["88.8.8".into(), "127.0.0.1".into()] - )); + assert!(check(json!(["88.8.8", "127.0.0.1"]))); } #[test] fn test_parse_ip_v4() -> anyhow::Result<()> { diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 3151a77263..59d1ac17f4 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -4,11 +4,15 @@ use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; use crate::{ config::TlsServerEndPoint, console::AuthSecret, - sasl, scram, + context::RequestMonitoring, + intern::EndpointIdInt, + sasl, + scram::{self, threadpool::ThreadPool}, stream::{PqStream, Stream}, }; +use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::io; +use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; @@ -23,7 +27,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. 
-pub struct Scram<'a>(pub &'a scram::ServerSecret); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -51,7 +55,11 @@ impl AuthMethod for PasswordHack { /// Use clear-text password auth called `password` in docs /// -pub struct CleartextPassword(pub AuthSecret); +pub struct CleartextPassword { + pub pool: Arc, + pub endpoint: EndpointIdInt, + pub secret: AuthSecret, +} impl AuthMethod for CleartextPassword { #[inline(always)] @@ -124,7 +132,13 @@ impl AuthFlow<'_, S, CleartextPassword> { .strip_suffix(&[0]) .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; - let outcome = validate_password_and_exchange(password, self.state.0)?; + let outcome = validate_password_and_exchange( + &self.state.pool, + self.state.endpoint, + password, + self.state.secret, + ) + .await?; if let sasl::Outcome::Success(_) = &outcome { self.stream.write_message_noflush(&Be::AuthenticationOk)?; @@ -138,6 +152,11 @@ impl AuthFlow<'_, S, CleartextPassword> { impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. pub async fn authenticate(self) -> super::Result> { + let Scram(secret, ctx) = self.state; + + // pause the timer while we communicate with the client + let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + // Initial client message contains the chosen auth method's name. 
let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) @@ -148,9 +167,15 @@ impl AuthFlow<'_, S, Scram<'_>> { return Err(super::AuthError::bad_auth_method(sasl.method)); } + match sasl.method { + SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => { + ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) + } + _ => {} + } info!("client chooses {}", sasl.method); - let secret = self.state.0; let outcome = sasl::SaslStream::new(self.stream, sasl.message) .authenticate(scram::Exchange::new( secret, @@ -167,12 +192,14 @@ impl AuthFlow<'_, S, Scram<'_>> { } } -pub(super) fn validate_password_and_exchange( +pub(crate) async fn validate_password_and_exchange( + pool: &ThreadPool, + endpoint: EndpointIdInt, password: &[u8], secret: AuthSecret, ) -> super::Result> { match secret { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] AuthSecret::Md5(_) => { // test only Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password( @@ -181,13 +208,7 @@ pub(super) fn validate_password_and_exchange( } // perform scram authentication as both client and server to validate the keys AuthSecret::Scram(scram_secret) => { - use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; - let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported()); - let outcome = crate::scram::exchange( - &scram_secret, - sasl_client, - crate::config::TlsServerEndPoint::Undefined, - )?; + let outcome = crate::scram::exchange(pool, endpoint, &scram_secret, password).await?; let client_key = match outcome { sasl::Outcome::Success(client_key) => client_key, diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 372b0764ee..2ddf46fe25 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -4,10 +4,11 @@ //! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified. 
use bstr::ByteSlice; -use smol_str::SmolStr; + +use crate::EndpointId; pub struct PasswordHackPayload { - pub endpoint: SmolStr, + pub endpoint: EndpointId, pub password: Vec, } diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 1edbc1e7e7..e1674049a6 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -9,13 +9,14 @@ use futures::future::Either; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; -use proxy::proxy::run_until_cancelled; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled}; +use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; use anyhow::{anyhow, bail, ensure, Context}; -use clap::{self, Arg}; +use clap::Arg; use futures::TryFutureExt; -use proxy::console::messages::MetricsAuxInfo; use proxy::stream::{PqStream, Stream}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -65,6 +66,8 @@ async fn main() -> anyhow::Result<()> { let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + Metrics::install(Arc::new(ThreadPoolMetrics::new(0))); + let args = cli().get_matches(); let destination: String = args.get_one::("dest").unwrap().parse()?; @@ -76,37 +79,40 @@ async fn main() -> anyhow::Result<()> { (Some(key_path), Some(cert_path)) => { let key = { let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + let mut keys = + rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to read TLS keys at '{key_path}'"))?, + ) 
}; let cert_chain_bytes = std::fs::read(cert_path) .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - let cert_chain = { + let cert_chain: Vec<_> = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? - .into_iter() - .map(rustls::Certificate) - .collect_vec() + .try_collect() + .with_context(|| { + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") + })? }; // needed for channel bindings let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); + let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); (tls_config, tls_server_end_point) } @@ -171,16 +177,13 @@ async fn task_main( .context("failed to set socket option")?; info!(%peer_addr, "serving"); - let mut ctx = - RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni"); - handle_client( - &mut ctx, - dest_suffix, - tls_config, - tls_server_end_point, - socket, - ) - .await + let ctx = RequestMonitoring::new( + session_id, + peer_addr.ip(), + proxy::metrics::Protocol::SniRouter, + "sni", + ); + handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. 
@@ -202,6 +205,7 @@ async fn task_main( const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; async fn ssl_handshake( + ctx: &mut RequestMonitoring, raw_stream: S, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, @@ -231,7 +235,10 @@ async fn ssl_handshake( } Ok(Stream::Tls { - tls: Box::new(raw.upgrade(tls_config).await?), + tls: Box::new( + raw.upgrade(tls_config, !ctx.has_private_peer_addr()) + .await?, + ), tls_server_end_point, }) } @@ -240,19 +247,21 @@ async fn ssl_handshake( ?unexpected, "unexpected startup packet, rejecting connection" ); - stream.throw_error_str(ERR_INSECURE_CONNECTION).await? + stream + .throw_error_str(ERR_INSECURE_CONNECTION, proxy::error::ErrorKind::User) + .await? } } } async fn handle_client( - ctx: &mut RequestMonitoring, + mut ctx: RequestMonitoring, dest_suffix: Arc, tls_config: Arc, tls_server_end_point: TlsServerEndPoint, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { - let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?; + let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?; // Cut off first part of the SNI domain // We receive required destination details in the format of @@ -269,8 +278,15 @@ async fn handle_client( info!("destination: {}", destination); - let client = tokio::net::TcpStream::connect(destination).await?; + let mut client = tokio::net::TcpStream::connect(destination).await?; - let metrics_aux: MetricsAuxInfo = Default::default(); - proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await + // doesn't yet matter as pg-sni-router doesn't report analytics logs + ctx.set_success(); + ctx.log_connect(); + + // Starting from here we only proxy the client's traffic. 
+ info!("performing the proxy pass..."); + let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?; + + Ok(()) } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index d42906aa4a..dffebf5580 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,31 +1,50 @@ +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::meta::region::RegionProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::provider_config::ProviderConfig; +use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::AuthRateLimiter; +use proxy::auth::backend::MaybeOwned; +use proxy::cancellation::CancelMap; +use proxy::cancellation::CancellationHandler; +use proxy::config::remote_storage_from_toml; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; +use proxy::config::ProjectInfoCacheOptions; use proxy::console; -use proxy::console::provider::AllowedIpsCache; -use proxy::console::provider::NodeInfoCache; -use proxy::console::provider::RoleSecretCache; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; +use proxy::http::health_server::AppMetrics; +use proxy::metrics::Metrics; use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::RateLimiterConfig; +use proxy::redis::cancellation_publisher::RedisPublisherClient; +use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use proxy::redis::elasticache; +use proxy::redis::notifications; +use proxy::scram::threadpool::ThreadPool; +use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; use anyhow::bail; use 
proxy::config::{self, ProxyConfig}; use proxy::serverless; +use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; -use std::{borrow::Cow, net::SocketAddr}; use tokio::net::TcpListener; +use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use tracing::Instrument; use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -33,6 +52,9 @@ project_build_tag!(BUILD_TAG); use clap::{Parser, ValueEnum}; +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + #[derive(Clone, Debug, ValueEnum)] enum AuthBackend { Console, @@ -85,6 +107,9 @@ struct ProxyCliArgs { /// path to directory with TLS certificates for client postgres connections #[clap(long)] certs_dir: Option, + /// timeout for the TLS handshake + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + handshake_timeout: tokio::time::Duration, /// http endpoint to receive periodic metric updates #[clap(long)] metric_collection_endpoint: Option, @@ -95,8 +120,11 @@ struct ProxyCliArgs { #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). - #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] + #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)] wake_compute_lock: String, + /// lock for `connect_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable). 
+ #[clap(long, default_value = config::ConcurrencyLockOptions::DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK)] + connect_compute_lock: String, /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, @@ -105,29 +133,36 @@ struct ProxyCliArgs { /// timeout for scram authentication protocol #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] scram_protocol_timeout: tokio::time::Duration, + /// size of the threadpool for password hashing + #[clap(long, default_value_t = 4)] + scram_thread_pool_size: u8, /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated. #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, - /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`. - #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)] - rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm, - /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. - #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] - rate_limiter_timeout: tokio::time::Duration, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form '@'. /// Can be given multiple times for different bucket sizes. 
- #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)] endpoint_rps_limit: Vec, - /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. - #[clap(long, default_value_t = 100)] - initial_limit: usize, - #[clap(flatten)] - aimd_config: proxy::rate_limiter::AimdConfig, + /// Wake compute rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + wake_compute_limit: Vec, + /// Whether the auth rate limiter actually takes effect (for testing) + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + auth_rate_limit_enabled: bool, + /// Authentication rate limiter max number of hashes per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)] + auth_rate_limit: Vec, + /// The IP subnet to use when considering whether two IP addresses are considered the same. + #[clap(long, default_value_t = 64)] + auth_rate_limit_ip_subnet: u8, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + redis_rps_limit: Vec, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -137,9 +172,51 @@ struct ProxyCliArgs { /// disable ip check for http requests. If it is too time consuming, it could be turned off. 
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_ip_check_for_http: bool, - + /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) + #[clap(long)] + redis_notifications: Option, + /// redis host for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_host: Option, + /// redis port for streaming connections (might be different from the notifications host) + #[clap(long)] + redis_port: Option, + /// redis cluster name, used in aws elasticache + #[clap(long)] + redis_cluster_name: Option, + /// redis user_id, used in aws elasticache + #[clap(long)] + redis_user_id: Option, + /// aws region to retrieve credentials + #[clap(long, default_value_t = String::new())] + aws_region: String, + /// cache for `project_info` (use `size=0` to disable) + #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)] + project_info_cache: String, + /// cache for all valid endpoints + #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)] + endpoint_cache_config: String, #[clap(flatten)] parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, default_value = "{}")] + metric_backup_collection_remote_storage: String, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
+ #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node + #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] + connect_to_compute_retry: String, + /// Whether to retry the wake_compute request + #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)] + wake_compute_retry: String, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -156,6 +233,10 @@ struct SqlOverHttpArgs { #[clap(long, default_value_t = 20)] sql_over_http_pool_max_conns_per_endpoint: usize, + /// How many connections to pool for each endpoint. Excess connections are discarded + #[clap(long, default_value_t = 20000)] + sql_over_http_pool_max_total_conns: usize, + /// How long pooled connections should remain idle for before closing #[clap(long, default_value = "5m", value_parser = humantime::parse_duration)] sql_over_http_idle_timeout: tokio::time::Duration, @@ -170,6 +251,12 @@ struct SqlOverHttpArgs { /// increase memory used by the pool #[clap(long, default_value_t = 128)] sql_over_http_pool_shards: usize, + + #[clap(long, default_value_t = 10000)] + sql_over_http_client_conn_threshold: u64, + + #[clap(long, default_value_t = 64)] + sql_over_http_cancel_set_shards: usize, } #[tokio::main] @@ -180,12 +267,78 @@ async fn main() -> anyhow::Result<()> { info!("Version: {GIT_VERSION}"); info!("Build_tag: {BUILD_TAG}"); - ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG); + let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }); + + let jemalloc = match proxy::jemalloc::MetricRecorder::new() { + Ok(t) => Some(t), + Err(e) => { + tracing::error!(error = ?e, "could not start jemalloc metrics loop"); + None + } + }; let args = ProxyCliArgs::parse(); let config = build_config(&args)?; info!("Authentication backend: {}", config.auth_backend); + info!("Using region: {}", config.aws_region); 
+ + let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed + let provider_conf = + ProviderConfig::without_region().with_region(region_provider.region().await); + let aws_credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // needed to access remote extensions bucket + .or_else( + "token", + WebIdentityTokenCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses imds v2 + .or_else("imds", ImdsCredentialsProvider::builder().build()) + }; + let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( + elasticache::AWSIRSAConfig::new( + config.aws_region.clone(), + args.redis_cluster_name, + args.redis_user_id, + ), + aws_credentials_provider, + )); + let regional_redis_client = match (args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host, + port, + elasticache_credentials_provider.clone(), + ), + ), + (None, None) => { + warn!("Redis events from console are disabled"); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }; + let redis_notifications_client = if let Some(url) = args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) + } else { + regional_redis_client.clone() + }; // Check that we can bind to address before further initialization let http_address: SocketAddr = args.http.parse()?; @@ -201,7 +354,27 @@ async fn main() -> anyhow::Result<()> { let proxy_listener = 
TcpListener::bind(proxy_address).await?; let cancellation_token = CancellationToken::new(); - let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); + let cancel_map = CancelMap::default(); + + let redis_publisher = match ®ional_redis_client { + Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( + redis_publisher.clone(), + args.region.clone(), + &config.redis_rps_limit, + )?))), + None => None, + }; + let cancellation_handler = Arc::new(CancellationHandler::< + Option>>, + >::new( + cancel_map.clone(), + redis_publisher, + proxy::metrics::CancellationSource::FromClient, + )); + + let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); + RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit)); // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) @@ -210,6 +383,7 @@ async fn main() -> anyhow::Result<()> { config, proxy_listener, cancellation_token.clone(), + cancellation_handler.clone(), endpoint_rate_limiter.clone(), )); @@ -224,6 +398,7 @@ async fn main() -> anyhow::Result<()> { config, serverless_listener, cancellation_token.clone(), + cancellation_handler.clone(), endpoint_rate_limiter.clone(), )); } @@ -235,12 +410,61 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. 
these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token)); - maintenance_tasks.spawn(http::health_server::task_main(http_listener)); + maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); + maintenance_tasks.spawn(http::health_server::task_main( + http_listener, + AppMetrics { + jemalloc, + neon_metrics, + proxy: proxy::metrics::Metrics::get(), + }, + )); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardles of the metric collection being enabled. maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + client_tasks.spawn(usage_metrics::task_backup( + &metrics_config.backup_metric_collection_config, + cancellation_token.clone(), + )); + } + + if let auth::BackendType::Console(api, _) = &config.auth_backend { + if let proxy::console::provider::ConsoleBackend::Console(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } + } } let maintenance = loop { @@ -269,6 +493,9 @@ async fn 
main() -> anyhow::Result<()> { /// ProxyConfig is created at proxy startup, and lives forever. fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { + let thread_pool = ThreadPool::new(args.scram_thread_pool_size); + Metrics::install(thread_pool.metrics.clone()); + let tls_config = match (&args.tls_key, &args.tls_cert) { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, @@ -282,6 +509,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { if args.allow_self_signed_compute { warn!("allowing self-signed compute certificates"); } + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + interval: args.metric_backup_collection_interval, + remote_storage_config: remote_storage_from_toml( + &args.metric_backup_collection_remote_storage, + )?, + chunk_size: args.metric_backup_collection_chunk_size, + }; let metric_collection = match ( &args.metric_collection_endpoint, @@ -290,6 +524,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { endpoint: endpoint.parse()?, interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, }), (None, None) => None, _ => bail!( @@ -297,74 +532,96 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - let rate_limiter_config = RateLimiterConfig { - disable: args.disable_dynamic_rate_limiter, - algorithm: args.rate_limit_algorithm, - timeout: args.rate_limiter_timeout, - initial_limit: args.initial_limit, - aimd_config: Some(args.aimd_config), - }; + if !args.disable_dynamic_rate_limiter { + bail!("dynamic rate limiter should be disabled"); + } let auth_backend = match &args.auth_backend { AuthBackend::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; - let allowed_ips_cache_config: 
CacheOptions = args.allowed_ips_cache.parse()?; - let role_secret_cache_config: CacheOptions = args.role_secret_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); - info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}"); - info!("Using RoleSecretCache (wake_compute) with options={role_secret_cache_config:?}"); - let caches = Box::leak(Box::new(console::caches::ApiCaches { - node_info: NodeInfoCache::new( - "node_info_cache", - wake_compute_cache_config.size, - wake_compute_cache_config.ttl, - true, - ), - allowed_ips: AllowedIpsCache::new( - "allowed_ips_cache", - allowed_ips_cache_config.size, - allowed_ips_cache_config.ttl, - false, - ), - role_secret: RoleSecretCache::new( - "role_secret_cache", - role_secret_cache_config.size, - role_secret_cache_config.ttl, - false, - ), - })); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(console::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); - let config::WakeComputeLockOptions { + let config::ConcurrencyLockOptions { shards, - permits, + limiter, epoch, timeout, } = args.wake_compute_lock.parse()?; - info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)"); - let locks = Box::leak(Box::new( - console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout) - .unwrap(), - )); - tokio::spawn(locks.garbage_collect_worker(epoch)); + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(console::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + 
timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); let url = args.auth_endpoint.parse()?; - let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config)); + let endpoint = http::Endpoint::new(url, http::new_client()); - let api = console::provider::neon::Api::new(endpoint, caches, locks); - auth::BackendType::Console(Cow::Owned(api), ()) + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(EndpointRateLimiter::new(wake_compute_rps_limit)); + let api = console::provider::neon::Api::new( + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + let api = console::provider::ConsoleBackend::Console(api); + auth::BackendType::Console(MaybeOwned::Owned(api), ()) } #[cfg(feature = "testing")] AuthBackend::Postgres => { let url = args.auth_endpoint.parse()?; let api = console::provider::mock::Api::new(url); - auth::BackendType::Postgres(Cow::Owned(api), ()) + let api = console::provider::ConsoleBackend::Postgres(api); + auth::BackendType::Console(MaybeOwned::Owned(api), ()) } AuthBackend::Link => { let url = args.uri.parse()?; - auth::BackendType::Link(Cow::Owned(url)) + auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = console::locks::ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + let http_config = HttpConfig { request_timeout: args.sql_over_http.sql_over_http_timeout, pool_options: GlobalConnPoolOptions { @@ -373,14 +630,21 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { pool_shards: 
args.sql_over_http.sql_over_http_pool_shards, idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, }; let authentication_config = AuthenticationConfig { + thread_pool, scram_protocol_timeout: args.scram_protocol_timeout, + rate_limiter_enabled: args.auth_rate_limit_enabled, + rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, }; - let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); - RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let mut redis_rps_limit = args.redis_rps_limit.clone(); + RateBucketInfo::validate(&mut redis_rps_limit)?; let config = Box::leak(Box::new(ProxyConfig { tls_config, @@ -391,11 +655,19 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, - endpoint_rps_limit, - // TODO: add this argument + redis_rps_limit, + handshake_timeout: args.handshake_timeout, region: args.region.clone(), + aws_region: args.aws_region.clone(), + wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute_retry_config: config::RetryConfig::parse( + &args.connect_to_compute_retry, + )?, })); + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + Ok(config) } diff --git a/proxy/src/cache.rs b/proxy/src/cache.rs index f54f360b01..d1d4087241 100644 --- a/proxy/src/cache.rs +++ b/proxy/src/cache.rs @@ -1,311 +1,7 @@ -use std::{ - borrow::Borrow, - hash::Hash, - ops::{Deref, DerefMut}, - time::{Duration, Instant}, -}; -use tracing::debug; - -// This seems 
to make more sense than `lru` or `cached`: -// -// * `near/nearcore` ditched `cached` in favor of `lru` -// (https://github.com/near/nearcore/issues?q=is%3Aissue+lru+is%3Aclosed). -// -// * `lru` methods use an obscure `KeyRef` type in their contraints (which is deliberately excluded from docs). -// This severely hinders its usage both in terms of creating wrappers and supported key types. -// -// On the other hand, `hashlink` has good download stats and appears to be maintained. -use hashlink::{linked_hash_map::RawEntryMut, LruCache}; - -/// A generic trait which exposes types of cache's key and value, -/// as well as the notion of cache entry invalidation. -/// This is useful for [`timed_lru::Cached`]. -pub trait Cache { - /// Entry's key. - type Key; - - /// Entry's value. - type Value; - - /// Used for entry invalidation. - type LookupInfo; - - /// Invalidate an entry using a lookup info. - /// We don't have an empty default impl because it's error-prone. - fn invalidate(&self, _: &Self::LookupInfo); -} - -impl Cache for &C { - type Key = C::Key; - type Value = C::Value; - type LookupInfo = C::LookupInfo; - - fn invalidate(&self, info: &Self::LookupInfo) { - C::invalidate(self, info) - } -} +pub mod common; +pub mod endpoints; +pub mod project_info; +mod timed_lru; +pub use common::{Cache, Cached}; pub use timed_lru::TimedLru; -pub mod timed_lru { - use super::*; - - /// An implementation of timed LRU cache with fixed capacity. - /// Key properties: - /// - /// * Whenever a new entry is inserted, the least recently accessed one is evicted. - /// The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`). - /// - /// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp. - /// If the entry has expired, we remove it from the cache; Otherwise we bump the - /// expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong - /// its existence. 
- /// - /// * There's an API for immediate invalidation (removal) of a cache entry; - /// It's useful in case we know for sure that the entry is no longer correct. - /// See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information. - /// - /// * Expired entries are kept in the cache, until they are evicted by the LRU policy, - /// or by a successful lookup (i.e. the entry hasn't expired yet). - /// There is no background job to reap the expired records. - /// - /// * It's possible for an entry that has not yet expired entry to be evicted - /// before expired items. That's a bit wasteful, but probably fine in practice. - pub struct TimedLru { - /// Cache's name for tracing. - name: &'static str, - - /// The underlying cache implementation. - cache: parking_lot::Mutex>>, - - /// Default time-to-live of a single entry. - ttl: Duration, - - update_ttl_on_retrieval: bool, - } - - impl Cache for TimedLru { - type Key = K; - type Value = V; - type LookupInfo = LookupInfo; - - fn invalidate(&self, info: &Self::LookupInfo) { - self.invalidate_raw(info) - } - } - - struct Entry { - created_at: Instant, - expires_at: Instant, - value: T, - } - - impl TimedLru { - /// Construct a new LRU cache with timed entries. - pub fn new( - name: &'static str, - capacity: usize, - ttl: Duration, - update_ttl_on_retrieval: bool, - ) -> Self { - Self { - name, - cache: LruCache::new(capacity).into(), - ttl, - update_ttl_on_retrieval, - } - } - - /// Drop an entry from the cache if it's outdated. - #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] - fn invalidate_raw(&self, info: &LookupInfo) { - let now = Instant::now(); - - // Do costly things before taking the lock. - let mut cache = self.cache.lock(); - let raw_entry = match cache.raw_entry_mut().from_key(&info.key) { - RawEntryMut::Vacant(_) => return, - RawEntryMut::Occupied(x) => x, - }; - - // Remove the entry if it was created prior to lookup timestamp. 
- let entry = raw_entry.get(); - let (created_at, expires_at) = (entry.created_at, entry.expires_at); - let should_remove = created_at <= info.created_at || expires_at <= now; - - if should_remove { - raw_entry.remove(); - } - - drop(cache); // drop lock before logging - debug!( - created_at = format_args!("{created_at:?}"), - expires_at = format_args!("{expires_at:?}"), - entry_removed = should_remove, - "processed a cache entry invalidation event" - ); - } - - /// Try retrieving an entry by its key, then execute `extract` if it exists. - #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] - fn get_raw(&self, key: &Q, extract: impl FnOnce(&K, &Entry) -> R) -> Option - where - K: Borrow, - Q: Hash + Eq + ?Sized, - { - let now = Instant::now(); - let deadline = now.checked_add(self.ttl).expect("time overflow"); - - // Do costly things before taking the lock. - let mut cache = self.cache.lock(); - let mut raw_entry = match cache.raw_entry_mut().from_key(key) { - RawEntryMut::Vacant(_) => return None, - RawEntryMut::Occupied(x) => x, - }; - - // Immeditely drop the entry if it has expired. - let entry = raw_entry.get(); - if entry.expires_at <= now { - raw_entry.remove(); - return None; - } - - let value = extract(raw_entry.key(), entry); - let (created_at, expires_at) = (entry.created_at, entry.expires_at); - - // Update the deadline and the entry's position in the LRU list. - if self.update_ttl_on_retrieval { - raw_entry.get_mut().expires_at = deadline; - } - raw_entry.to_back(); - - drop(cache); // drop lock before logging - debug!( - created_at = format_args!("{created_at:?}"), - old_expires_at = format_args!("{expires_at:?}"), - new_expires_at = format_args!("{deadline:?}"), - "accessed a cache entry" - ); - - Some(value) - } - - /// Insert an entry to the cache. If an entry with the same key already - /// existed, return the previous value and its creation timestamp. 
- #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] - fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { - let created_at = Instant::now(); - let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); - - let entry = Entry { - created_at, - expires_at, - value, - }; - - // Do costly things before taking the lock. - let old = self - .cache - .lock() - .insert(key, entry) - .map(|entry| entry.value); - - debug!( - created_at = format_args!("{created_at:?}"), - expires_at = format_args!("{expires_at:?}"), - replaced = old.is_some(), - "created a cache entry" - ); - - (created_at, old) - } - } - - impl TimedLru { - pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { - let (created_at, old) = self.insert_raw(key.clone(), value.clone()); - - let cached = Cached { - token: Some((self, LookupInfo { created_at, key })), - value, - }; - - (old, cached) - } - } - - impl TimedLru { - /// Retrieve a cached entry in convenient wrapper. - pub fn get(&self, key: &Q) -> Option> - where - K: Borrow + Clone, - Q: Hash + Eq + ?Sized, - { - self.get_raw(key, |key, entry| { - let info = LookupInfo { - created_at: entry.created_at, - key: key.clone(), - }; - - Cached { - token: Some((self, info)), - value: entry.value.clone(), - } - }) - } - } - - /// Lookup information for key invalidation. - pub struct LookupInfo { - /// Time of creation of a cache [`Entry`]. - /// We use this during invalidation lookups to prevent eviction of a newer - /// entry sharing the same key (it might've been inserted by a different - /// task after we got the entry we're trying to invalidate now). - created_at: Instant, - - /// Search by this key. - key: K, - } - - /// Wrapper for convenient entry invalidation. - pub struct Cached { - /// Cache + lookup info. - token: Option<(C, C::LookupInfo)>, - - /// The value itself. - value: C::Value, - } - - impl Cached { - /// Place any entry into this wrapper; invalidation will be a no-op. 
- pub fn new_uncached(value: C::Value) -> Self { - Self { token: None, value } - } - - /// Drop this entry from a cache if it's still there. - pub fn invalidate(self) -> C::Value { - if let Some((cache, info)) = &self.token { - cache.invalidate(info); - } - self.value - } - - /// Tell if this entry is actually cached. - pub fn cached(&self) -> bool { - self.token.is_some() - } - } - - impl Deref for Cached { - type Target = C::Value; - - fn deref(&self) -> &Self::Target { - &self.value - } - } - - impl DerefMut for Cached { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.value - } - } -} diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs new file mode 100644 index 0000000000..bc1c37512b --- /dev/null +++ b/proxy/src/cache/common.rs @@ -0,0 +1,82 @@ +use std::ops::{Deref, DerefMut}; + +/// A generic trait which exposes types of cache's key and value, +/// as well as the notion of cache entry invalidation. +/// This is useful for [`Cached`]. +pub trait Cache { + /// Entry's key. + type Key; + + /// Entry's value. + type Value; + + /// Used for entry invalidation. + type LookupInfo; + + /// Invalidate an entry using a lookup info. + /// We don't have an empty default impl because it's error-prone. + fn invalidate(&self, _: &Self::LookupInfo); +} + +impl Cache for &C { + type Key = C::Key; + type Value = C::Value; + type LookupInfo = C::LookupInfo; + + fn invalidate(&self, info: &Self::LookupInfo) { + C::invalidate(self, info) + } +} + +/// Wrapper for convenient entry invalidation. +pub struct Cached::Value> { + /// Cache + lookup info. + pub token: Option<(C, C::LookupInfo)>, + + /// The value itself. + pub value: V, +} + +impl Cached { + /// Place any entry into this wrapper; invalidation will be a no-op. 
+ pub fn new_uncached(value: V) -> Self { + Self { token: None, value } + } + + pub fn take_value(self) -> (Cached, V) { + ( + Cached { + token: self.token, + value: (), + }, + self.value, + ) + } + + /// Drop this entry from a cache if it's still there. + pub fn invalidate(self) -> V { + if let Some((cache, info)) = &self.token { + cache.invalidate(info); + } + self.value + } + + /// Tell if this entry is actually cached. + pub fn cached(&self) -> bool { + self.token.is_some() + } +} + +impl Deref for Cached { + type Target = V; + + fn deref(&self) -> &Self::Target { + &self.value + } +} + +impl DerefMut for Cached { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.value + } +} diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs new file mode 100644 index 0000000000..4bc10a6020 --- /dev/null +++ b/proxy/src/cache/endpoints.rs @@ -0,0 +1,247 @@ +use std::{ + convert::Infallible, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; + +use dashmap::DashSet; +use redis::{ + streams::{StreamReadOptions, StreamReadReply}, + AsyncCommands, FromRedisValue, Value, +}; +use serde::Deserialize; +use tokio::sync::Mutex; +use tokio_util::sync::CancellationToken; +use tracing::info; + +use crate::{ + config::EndpointCacheConfig, + context::RequestMonitoring, + intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, + rate_limiter::GlobalRateLimiter, + redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, + EndpointId, +}; + +#[derive(Deserialize, Debug, Clone)] +pub struct ControlPlaneEventKey { + endpoint_created: Option, + branch_created: Option, + project_created: Option, +} +#[derive(Deserialize, Debug, Clone)] +struct EndpointCreated { + endpoint_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct BranchCreated { + branch_id: String, +} +#[derive(Deserialize, Debug, Clone)] +struct ProjectCreated { + project_id: String, +} + +pub 
struct EndpointsCache { + config: EndpointCacheConfig, + endpoints: DashSet, + branches: DashSet, + projects: DashSet, + ready: AtomicBool, + limiter: Arc>, +} + +impl EndpointsCache { + pub fn new(config: EndpointCacheConfig) -> Self { + Self { + limiter: Arc::new(Mutex::new(GlobalRateLimiter::new( + config.limiter_info.clone(), + ))), + config, + endpoints: DashSet::new(), + branches: DashSet::new(), + projects: DashSet::new(), + ready: AtomicBool::new(false), + } + } + pub async fn is_valid(&self, ctx: &mut RequestMonitoring, endpoint: &EndpointId) -> bool { + if !self.ready.load(Ordering::Acquire) { + return true; + } + let rejected = self.should_reject(endpoint); + ctx.set_rejected(rejected); + info!(?rejected, "check endpoint is valid, disabled cache"); + // If cache is disabled, just collect the metrics and return or + // If the limiter allows, we don't need to check the cache. + if self.config.disable_cache || self.limiter.lock().await.check() { + return true; + } + !rejected + } + fn should_reject(&self, endpoint: &EndpointId) -> bool { + if endpoint.is_endpoint() { + !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + } else if endpoint.is_branch() { + !self + .branches + .contains(&BranchIdInt::from(&endpoint.as_branch())) + } else { + !self + .projects + .contains(&ProjectIdInt::from(&endpoint.as_project())) + } + } + fn insert_event(&self, key: ControlPlaneEventKey) { + // Do not do normalization here, we expect the events to be normalized. 
+ if let Some(endpoint_created) = key.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::EndpointCreated); + } + if let Some(branch_created) = key.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BranchCreated); + } + if let Some(project_created) = key.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::ProjectCreated); + } + } + pub async fn do_read( + &self, + mut con: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, + ) -> anyhow::Result { + let mut last_id = "0-0".to_string(); + loop { + if let Err(e) = con.connect().await { + tracing::error!("error connecting to redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await { + tracing::error!("error reading from redis: {:?}", e); + self.ready.store(false, Ordering::Release); + } + if cancellation_token.is_cancelled() { + info!("cancellation token is cancelled, exiting"); + tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await; + // 1 week. 
+ } + tokio::time::sleep(self.config.retry_interval).await; + } + } + async fn read_from_stream( + &self, + con: &mut ConnectionWithCredentialsProvider, + last_id: &mut String, + ) -> anyhow::Result<()> { + tracing::info!("reading endpoints/branches/projects from redis"); + self.batch_read( + con, + StreamReadOptions::default().count(self.config.initial_batch_size), + last_id, + true, + ) + .await?; + tracing::info!("ready to filter user requests"); + self.ready.store(true, Ordering::Release); + self.batch_read( + con, + StreamReadOptions::default() + .count(self.config.default_batch_size) + .block(self.config.xread_timeout.as_millis() as usize), + last_id, + false, + ) + .await + } + fn parse_key_value(value: &Value) -> anyhow::Result { + let s: String = FromRedisValue::from_redis_value(value)?; + Ok(serde_json::from_str(&s)?) + } + async fn batch_read( + &self, + conn: &mut ConnectionWithCredentialsProvider, + opts: StreamReadOptions, + last_id: &mut String, + return_when_finish: bool, + ) -> anyhow::Result<()> { + let mut total: usize = 0; + loop { + let mut res: StreamReadReply = conn + .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts) + .await?; + + if res.keys.is_empty() { + if return_when_finish { + if total != 0 { + break; + } + anyhow::bail!( + "Redis stream {} is empty, cannot be used to filter endpoints", + self.config.stream_name + ); + } + // If we are not returning when finish, we should wait for more data. 
+ continue; + } + if res.keys.len() != 1 { + anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); + } + + let res = res.keys.pop().expect("Checked length above"); + let len = res.ids.len(); + for x in res.ids { + total += 1; + for (_, v) in x.map { + let key = match Self::parse_key_value(&v) { + Ok(x) => x, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: &self.config.stream_name, + }); + tracing::error!("error parsing value {v:?}: {e:?}"); + continue; + } + }; + self.insert_event(key); + } + if total.is_power_of_two() { + tracing::debug!("endpoints read {}", total); + } + *last_id = x.id; + } + if return_when_finish && len <= self.config.default_batch_size { + break; + } + } + tracing::info!("read {} endpoints/branches/projects from redis", total); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::ControlPlaneEventKey; + + #[test] + fn test() { + let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; + let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap(); + } +} diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs new file mode 100644 index 0000000000..10cc4ceee1 --- /dev/null +++ b/proxy/src/cache/project_info.rs @@ -0,0 +1,574 @@ +use std::{ + collections::HashSet, + convert::Infallible, + sync::{atomic::AtomicU64, Arc}, + time::Duration, +}; + +use async_trait::async_trait; +use dashmap::DashMap; +use rand::{thread_rng, Rng}; +use smol_str::SmolStr; +use tokio::sync::Mutex; +use tokio::time::Instant; +use tracing::{debug, info}; + +use crate::{ + auth::IpPattern, + config::ProjectInfoCacheOptions, + console::AuthSecret, + intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, + EndpointId, RoleName, +}; + +use super::{Cache, Cached}; + +#[async_trait] +pub trait ProjectInfoCache { + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt); + fn 
invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt); + async fn decrement_active_listeners(&self); + async fn increment_active_listeners(&self); +} + +struct Entry { + created_at: Instant, + value: T, +} + +impl Entry { + pub fn new(value: T) -> Self { + Self { + created_at: Instant::now(), + value, + } + } +} + +impl From for Entry { + fn from(value: T) -> Self { + Self::new(value) + } +} + +#[derive(Default)] +struct EndpointInfo { + secret: std::collections::HashMap>>, + allowed_ips: Option>>>, +} + +impl EndpointInfo { + fn check_ignore_cache(ignore_cache_since: Option, created_at: Instant) -> bool { + match ignore_cache_since { + None => false, + Some(t) => t < created_at, + } + } + pub fn get_role_secret( + &self, + role_name: RoleNameInt, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(Option, bool)> { + if let Some(secret) = self.secret.get(&role_name) { + if valid_since < secret.created_at { + return Some(( + secret.value.clone(), + Self::check_ignore_cache(ignore_cache_since, secret.created_at), + )); + } + } + None + } + + pub fn get_allowed_ips( + &self, + valid_since: Instant, + ignore_cache_since: Option, + ) -> Option<(Arc>, bool)> { + if let Some(allowed_ips) = &self.allowed_ips { + if valid_since < allowed_ips.created_at { + return Some(( + allowed_ips.value.clone(), + Self::check_ignore_cache(ignore_cache_since, allowed_ips.created_at), + )); + } + } + None + } + pub fn invalidate_allowed_ips(&mut self) { + self.allowed_ips = None; + } + pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) { + self.secret.remove(&role_name); + } +} + +/// Cache for project info. +/// This is used to cache auth data for endpoints. +/// Invalidation is done by console notifications or by TTL (if console notifications are disabled). +/// +/// We also store endpoint-to-project mapping in the cache, to be able to access per-endpoint data. 
+/// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available? +/// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache. +pub struct ProjectInfoCacheImpl { + cache: DashMap, + + project2ep: DashMap>, + config: ProjectInfoCacheOptions, + + start_time: Instant, + ttl_disabled_since_us: AtomicU64, + active_listeners_lock: Mutex, +} + +#[async_trait] +impl ProjectInfoCache for ProjectInfoCacheImpl { + fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) { + info!("invalidating allowed ips for project `{}`", project_id); + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_allowed_ips(); + } + } + } + fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt) { + info!( + "invalidating role secret for project_id `{}` and role_name `{}`", + project_id, role_name, + ); + let endpoints = self + .project2ep + .get(&project_id) + .map(|kv| kv.value().clone()) + .unwrap_or_default(); + for endpoint_id in endpoints { + if let Some(mut endpoint_info) = self.cache.get_mut(&endpoint_id) { + endpoint_info.invalidate_role_secret(role_name); + } + } + } + async fn decrement_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + if *listeners_guard == 0 { + tracing::error!("active_listeners count is already 0, something is broken"); + return; + } + *listeners_guard -= 1; + if *listeners_guard == 0 { + self.ttl_disabled_since_us + .store(u64::MAX, std::sync::atomic::Ordering::SeqCst); + } + } + + async fn increment_active_listeners(&self) { + let mut listeners_guard = self.active_listeners_lock.lock().await; + *listeners_guard += 1; + if *listeners_guard == 1 { + let new_ttl 
= (self.start_time.elapsed() + self.config.ttl).as_micros() as u64; + self.ttl_disabled_since_us + .store(new_ttl, std::sync::atomic::Ordering::SeqCst); + } + } +} + +impl ProjectInfoCacheImpl { + pub fn new(config: ProjectInfoCacheOptions) -> Self { + Self { + cache: DashMap::new(), + project2ep: DashMap::new(), + config, + ttl_disabled_since_us: AtomicU64::new(u64::MAX), + start_time: Instant::now(), + active_listeners_lock: Mutex::new(0), + } + } + + pub fn get_role_secret( + &self, + endpoint_id: &EndpointId, + role_name: &RoleName, + ) -> Option>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let role_name = RoleNameInt::get(role_name)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let (value, ignore_cache) = + endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?; + if !ignore_cache { + let cached = Cached { + token: Some(( + self, + CachedLookupInfo::new_role_secret(endpoint_id, role_name), + )), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub fn get_allowed_ips( + &self, + endpoint_id: &EndpointId, + ) -> Option>>> { + let endpoint_id = EndpointIdInt::get(endpoint_id)?; + let (valid_since, ignore_cache_since) = self.get_cache_times(); + let endpoint_info = self.cache.get(&endpoint_id)?; + let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since); + let (value, ignore_cache) = value?; + if !ignore_cache { + let cached = Cached { + token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id))), + value, + }; + return Some(cached); + } + Some(Cached::new_uncached(value)) + } + pub fn insert_role_secret( + &self, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + role_name: RoleNameInt, + secret: Option, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. 
+ return; + } + self.insert_project2endpoint(project_id, endpoint_id); + let mut entry = self.cache.entry(endpoint_id).or_default(); + if entry.secret.len() < self.config.max_roles { + entry.secret.insert(role_name, secret.into()); + } + } + pub fn insert_allowed_ips( + &self, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + allowed_ips: Arc>, + ) { + if self.cache.len() >= self.config.size { + // If there are too many entries, wait until the next gc cycle. + return; + } + self.insert_project2endpoint(project_id, endpoint_id); + self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into()); + } + fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) { + if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) { + endpoints.insert(endpoint_id); + } else { + self.project2ep + .insert(project_id, HashSet::from([endpoint_id])); + } + } + fn get_cache_times(&self) -> (Instant, Option) { + let mut valid_since = Instant::now() - self.config.ttl; + // Only ignore cache if ttl is disabled. + let ttl_disabled_since_us = self + .ttl_disabled_since_us + .load(std::sync::atomic::Ordering::Relaxed); + let ignore_cache_since = if ttl_disabled_since_us != u64::MAX { + let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us); + // We are fine if entry is not older than ttl or was added before we are getting notifications. + valid_since = valid_since.min(ignore_cache_since); + Some(ignore_cache_since) + } else { + None + }; + (valid_since, ignore_cache_since) + } + + pub async fn gc_worker(&self) -> anyhow::Result { + let mut interval = + tokio::time::interval(self.config.gc_interval / (self.cache.shards().len()) as u32); + loop { + interval.tick().await; + if self.cache.len() < self.config.size { + // If there are not too many entries, wait until the next gc cycle. 
+ continue; + } + self.gc(); + } + } + + fn gc(&self) { + let shard = thread_rng().gen_range(0..self.project2ep.shards().len()); + debug!(shard, "project_info_cache: performing epoch reclamation"); + + // acquire a random shard lock + let mut removed = 0; + let shard = self.project2ep.shards()[shard].write(); + for (_, endpoints) in shard.iter() { + for endpoint in endpoints.get().iter() { + self.cache.remove(endpoint); + removed += 1; + } + } + // We can drop this shard only after making sure that all endpoints are removed. + drop(shard); + info!("project_info_cache: removed {removed} endpoints"); + } +} + +/// Lookup info for project info cache. +/// This is used to invalidate cache entries. +pub struct CachedLookupInfo { + /// Search by this key. + endpoint_id: EndpointIdInt, + lookup_type: LookupType, +} + +impl CachedLookupInfo { + pub(self) fn new_role_secret(endpoint_id: EndpointIdInt, role_name: RoleNameInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::RoleSecret(role_name), + } + } + pub(self) fn new_allowed_ips(endpoint_id: EndpointIdInt) -> Self { + Self { + endpoint_id, + lookup_type: LookupType::AllowedIps, + } + } +} + +enum LookupType { + RoleSecret(RoleNameInt), + AllowedIps, +} + +impl Cache for ProjectInfoCacheImpl { + type Key = SmolStr; + // Value is not really used here, but we need to specify it. 
+ type Value = SmolStr; + + type LookupInfo = CachedLookupInfo; + + fn invalidate(&self, key: &Self::LookupInfo) { + match &key.lookup_type { + LookupType::RoleSecret(role_name) => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_role_secret(*role_name); + } + } + LookupType::AllowedIps => { + if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) { + endpoint_info.invalidate_allowed_ips(); + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{scram::ServerSecret, ProjectId}; + + #[tokio::test] + async fn test_project_info_cache_settings() { + tokio::time::pause(); + let cache = ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { + size: 2, + max_roles: 2, + ttl: Duration::from_secs(1), + gc_interval: Duration::from_secs(600), + }); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); + let user1: RoleName = "user1".into(); + let user2: RoleName = "user2".into(); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + let secret2 = None; + let allowed_ips = Arc::new(vec![ + "127.0.0.1".parse().unwrap(), + "127.0.0.2".parse().unwrap(), + ]); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); + + let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); + assert!(cached.cached()); + assert_eq!(cached.value, secret1); + let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap(); + assert!(cached.cached()); + assert_eq!(cached.value, secret2); + + // Shouldn't add more than 2 roles. 
+ let user3: RoleName = "user3".into(); + let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32]))); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user3).into(), + secret3.clone(), + ); + assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); + + let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); + assert!(cached.cached()); + assert_eq!(cached.value, allowed_ips); + + tokio::time::advance(Duration::from_secs(2)).await; + let cached = cache.get_role_secret(&endpoint_id, &user1); + assert!(cached.is_none()); + let cached = cache.get_role_secret(&endpoint_id, &user2); + assert!(cached.is_none()); + let cached = cache.get_allowed_ips(&endpoint_id); + assert!(cached.is_none()); + } + + #[tokio::test] + async fn test_project_info_cache_invalidations() { + tokio::time::pause(); + let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { + size: 2, + max_roles: 2, + ttl: Duration::from_secs(1), + gc_interval: Duration::from_secs(600), + })); + cache.clone().increment_active_listeners().await; + tokio::time::advance(Duration::from_secs(2)).await; + + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); + let user1: RoleName = "user1".into(); + let user2: RoleName = "user2".into(); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32]))); + let allowed_ips = Arc::new(vec![ + "127.0.0.1".parse().unwrap(), + "127.0.0.2".parse().unwrap(), + ]); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); + + tokio::time::advance(Duration::from_secs(2)).await; + // Nothing should be invalidated. 
+ + let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); + // TTL is disabled, so it should be impossible to invalidate this value. + assert!(!cached.cached()); + assert_eq!(cached.value, secret1); + + cached.invalidate(); // Shouldn't do anything. + let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); + assert_eq!(cached.value, secret1); + + let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap(); + assert!(!cached.cached()); + assert_eq!(cached.value, secret2); + + // The only way to invalidate this value is to invalidate via the api. + cache.invalidate_role_secret_for_project((&project_id).into(), (&user2).into()); + assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); + + let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); + assert!(!cached.cached()); + assert_eq!(cached.value, allowed_ips); + } + + #[tokio::test] + async fn test_increment_active_listeners_invalidate_added_before() { + tokio::time::pause(); + let cache = Arc::new(ProjectInfoCacheImpl::new(ProjectInfoCacheOptions { + size: 2, + max_roles: 2, + ttl: Duration::from_secs(1), + gc_interval: Duration::from_secs(600), + })); + + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); + let user1: RoleName = "user1".into(); + let user2: RoleName = "user2".into(); + let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); + let secret2 = Some(AuthSecret::Scram(ServerSecret::mock([2; 32]))); + let allowed_ips = Arc::new(vec![ + "127.0.0.1".parse().unwrap(), + "127.0.0.2".parse().unwrap(), + ]); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.clone().increment_active_listeners().await; + tokio::time::advance(Duration::from_millis(100)).await; + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + + // Added before ttl was disabled + ttl should be still 
cached. + let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); + assert!(cached.cached()); + let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap(); + assert!(cached.cached()); + + tokio::time::advance(Duration::from_secs(1)).await; + // Added before ttl was disabled + ttl should expire. + assert!(cache.get_role_secret(&endpoint_id, &user1).is_none()); + assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); + + // Added after ttl was disabled + ttl should not be cached. + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); + let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); + assert!(!cached.cached()); + + tokio::time::advance(Duration::from_secs(1)).await; + // Added before ttl was disabled + ttl still should expire. + assert!(cache.get_role_secret(&endpoint_id, &user1).is_none()); + assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); + // Shouldn't be invalidated. + + let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); + assert!(!cached.cached()); + assert_eq!(cached.value, allowed_ips); + } +} diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs new file mode 100644 index 0000000000..3b21381bb9 --- /dev/null +++ b/proxy/src/cache/timed_lru.rs @@ -0,0 +1,258 @@ +use std::{ + borrow::Borrow, + hash::Hash, + time::{Duration, Instant}, +}; +use tracing::debug; + +// This seems to make more sense than `lru` or `cached`: +// +// * `near/nearcore` ditched `cached` in favor of `lru` +// (https://github.com/near/nearcore/issues?q=is%3Aissue+lru+is%3Aclosed). +// +// * `lru` methods use an obscure `KeyRef` type in their contraints (which is deliberately excluded from docs). +// This severely hinders its usage both in terms of creating wrappers and supported key types. +// +// On the other hand, `hashlink` has good download stats and appears to be maintained. 
+use hashlink::{linked_hash_map::RawEntryMut, LruCache}; + +use super::{common::Cached, *}; + +/// An implementation of timed LRU cache with fixed capacity. +/// Key properties: +/// +/// * Whenever a new entry is inserted, the least recently accessed one is evicted. +/// The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`). +/// +/// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp. +/// If the entry has expired, we remove it from the cache; Otherwise we bump the +/// expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong +/// its existence. +/// +/// * There's an API for immediate invalidation (removal) of a cache entry; +/// It's useful in case we know for sure that the entry is no longer correct. +/// See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information. +/// +/// * Expired entries are kept in the cache, until they are evicted by the LRU policy, +/// or by a successful lookup (i.e. the entry hasn't expired yet). +/// There is no background job to reap the expired records. +/// +/// * It's possible for an entry that has not yet expired entry to be evicted +/// before expired items. That's a bit wasteful, but probably fine in practice. +pub struct TimedLru { + /// Cache's name for tracing. + name: &'static str, + + /// The underlying cache implementation. + cache: parking_lot::Mutex>>, + + /// Default time-to-live of a single entry. + ttl: Duration, + + update_ttl_on_retrieval: bool, +} + +impl Cache for TimedLru { + type Key = K; + type Value = V; + type LookupInfo = LookupInfo; + + fn invalidate(&self, info: &Self::LookupInfo) { + self.invalidate_raw(info) + } +} + +struct Entry { + created_at: Instant, + expires_at: Instant, + value: T, +} + +impl TimedLru { + /// Construct a new LRU cache with timed entries. 
+ pub fn new( + name: &'static str, + capacity: usize, + ttl: Duration, + update_ttl_on_retrieval: bool, + ) -> Self { + Self { + name, + cache: LruCache::new(capacity).into(), + ttl, + update_ttl_on_retrieval, + } + } + + /// Drop an entry from the cache if it's outdated. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn invalidate_raw(&self, info: &LookupInfo) { + let now = Instant::now(); + + // Do costly things before taking the lock. + let mut cache = self.cache.lock(); + let raw_entry = match cache.raw_entry_mut().from_key(&info.key) { + RawEntryMut::Vacant(_) => return, + RawEntryMut::Occupied(x) => x, + }; + + // Remove the entry if it was created prior to lookup timestamp. + let entry = raw_entry.get(); + let (created_at, expires_at) = (entry.created_at, entry.expires_at); + let should_remove = created_at <= info.created_at || expires_at <= now; + + if should_remove { + raw_entry.remove(); + } + + drop(cache); // drop lock before logging + debug!( + created_at = format_args!("{created_at:?}"), + expires_at = format_args!("{expires_at:?}"), + entry_removed = should_remove, + "processed a cache entry invalidation event" + ); + } + + /// Try retrieving an entry by its key, then execute `extract` if it exists. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn get_raw(&self, key: &Q, extract: impl FnOnce(&K, &Entry) -> R) -> Option + where + K: Borrow, + Q: Hash + Eq + ?Sized, + { + let now = Instant::now(); + let deadline = now.checked_add(self.ttl).expect("time overflow"); + + // Do costly things before taking the lock. + let mut cache = self.cache.lock(); + let mut raw_entry = match cache.raw_entry_mut().from_key(key) { + RawEntryMut::Vacant(_) => return None, + RawEntryMut::Occupied(x) => x, + }; + + // Immeditely drop the entry if it has expired. 
+ let entry = raw_entry.get(); + if entry.expires_at <= now { + raw_entry.remove(); + return None; + } + + let value = extract(raw_entry.key(), entry); + let (created_at, expires_at) = (entry.created_at, entry.expires_at); + + // Update the deadline and the entry's position in the LRU list. + if self.update_ttl_on_retrieval { + raw_entry.get_mut().expires_at = deadline; + } + raw_entry.to_back(); + + drop(cache); // drop lock before logging + debug!( + created_at = format_args!("{created_at:?}"), + old_expires_at = format_args!("{expires_at:?}"), + new_expires_at = format_args!("{deadline:?}"), + "accessed a cache entry" + ); + + Some(value) + } + + /// Insert an entry to the cache. If an entry with the same key already + /// existed, return the previous value and its creation timestamp. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { + let created_at = Instant::now(); + let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); + + let entry = Entry { + created_at, + expires_at, + value, + }; + + // Do costly things before taking the lock. + let old = self + .cache + .lock() + .insert(key, entry) + .map(|entry| entry.value); + + debug!( + created_at = format_args!("{created_at:?}"), + expires_at = format_args!("{expires_at:?}"), + replaced = old.is_some(), + "created a cache entry" + ); + + (created_at, old) + } +} + +impl TimedLru { + pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { + let (created_at, old) = self.insert_raw(key.clone(), value.clone()); + + let cached = Cached { + token: Some((self, LookupInfo { created_at, key })), + value, + }; + + (old, cached) + } +} + +impl TimedLru { + /// Retrieve a cached entry in convenient wrapper. 
+ pub fn get(&self, key: &Q) -> Option> + where + K: Borrow + Clone, + Q: Hash + Eq + ?Sized, + { + self.get_raw(key, |key, entry| { + let info = LookupInfo { + created_at: entry.created_at, + key: key.clone(), + }; + + Cached { + token: Some((self, info)), + value: entry.value.clone(), + } + }) + } + + /// Retrieve a cached entry in convenient wrapper, ignoring its TTL. + pub fn get_ignoring_ttl(&self, key: &Q) -> Option> + where + K: Borrow, + Q: Hash + Eq + ?Sized, + { + let mut cache = self.cache.lock(); + cache + .get(key) + .map(|entry| Cached::new_uncached(entry.value.clone())) + } + + /// Remove an entry from the cache. + pub fn remove(&self, key: &Q) -> Option + where + K: Borrow + Clone, + Q: Hash + Eq + ?Sized, + { + let mut cache = self.cache.lock(); + cache.remove(key).map(|entry| entry.value) + } +} + +/// Lookup information for key invalidation. +pub struct LookupInfo { + /// Time of creation of a cache [`Entry`]. + /// We use this during invalidation lookups to prevent eviction of a newer + /// entry sharing the same key (it might've been inserted by a different + /// task after we got the entry we're trying to invalidate now). + created_at: Instant, + + /// Search by this key. 
+ key: K, +} diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index a5eb3544b4..34512e9f5b 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,73 +1,147 @@ -use anyhow::{bail, Context}; use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::net::SocketAddr; +use std::{net::SocketAddr, sync::Arc}; +use thiserror::Error; use tokio::net::TcpStream; +use tokio::sync::Mutex; use tokio_postgres::{CancelToken, NoTls}; use tracing::info; +use uuid::Uuid; + +use crate::{ + error::ReportableError, + metrics::{CancellationRequest, CancellationSource, Metrics}, + redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, + }, +}; + +pub type CancelMap = Arc>>; +pub type CancellationHandlerMain = CancellationHandler>>>; +pub type CancellationHandlerMainInternal = Option>>; /// Enables serving `CancelRequest`s. -#[derive(Default)] -pub struct CancelMap(DashMap>); +/// +/// If `CancellationPublisher` is available, cancel request will be used to publish the cancellation key to other proxy instances. +pub struct CancellationHandler

{ + map: CancelMap, + client: P, + /// This field used for the monitoring purposes. + /// Represents the source of the cancellation request. + from: CancellationSource, +} -impl CancelMap { - /// Cancel a running query for the corresponding connection. - pub async fn cancel_session(&self, key: CancelKeyData) -> anyhow::Result<()> { - // NB: we should immediately release the lock after cloning the token. - let cancel_closure = self - .0 - .get(&key) - .and_then(|x| x.clone()) - .with_context(|| format!("query cancellation key not found: {key}"))?; +#[derive(Debug, Error)] +pub enum CancelError { + #[error("{0}")] + IO(#[from] std::io::Error), + #[error("{0}")] + Postgres(#[from] tokio_postgres::Error), +} - info!("cancelling query per user's request using key {key}"); - cancel_closure.try_cancel_query().await +impl ReportableError for CancelError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + CancelError::IO(_) => crate::error::ErrorKind::Compute, + CancelError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + CancelError::Postgres(_) => crate::error::ErrorKind::Compute, + } } +} +impl CancellationHandler

{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. - pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result - where - F: FnOnce(Session<'a>) -> R, - R: std::future::Future>, - { + pub fn get_session(self: Arc) -> Session

{ // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the // actual backend_pid, but backend_pid is not used for anything // so it doesn't matter. - let key = rand::random(); + let key = loop { + let key = rand::random(); - // Random key collisions are unlikely to happen here, but they're still possible, - // which is why we have to take care not to rewrite an existing key. - match self.0.entry(key) { - dashmap::mapref::entry::Entry::Occupied(_) => { - bail!("query cancellation key already exists: {key}") + // Random key collisions are unlikely to happen here, but they're still possible, + // which is why we have to take care not to rewrite an existing key. + match self.map.entry(key) { + dashmap::mapref::entry::Entry::Occupied(_) => continue, + dashmap::mapref::entry::Entry::Vacant(e) => { + e.insert(None); + } } - dashmap::mapref::entry::Entry::Vacant(e) => { - e.insert(None); - } - } - - // This will guarantee that the session gets dropped - // as soon as the future is finished. - scopeguard::defer! { - self.0.remove(&key); - info!("dropped query cancellation key {key}"); - } + break key; + }; info!("registered new query cancellation key {key}"); - let session = Session::new(key, self); - f(session).await + Session { + key, + cancellation_handler: self, + } + } + /// Try to cancel a running query for the corresponding connection. + /// If the cancellation key is not found, it will be published to Redis. + pub async fn cancel_session( + &self, + key: CancelKeyData, + session_id: Uuid, + ) -> Result<(), CancelError> { + // NB: we should immediately release the lock after cloning the token. 
+ let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { + tracing::warn!("query cancellation key not found: {key}"); + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::NotFound, + }); + match self.client.try_publish(key, session_id).await { + Ok(()) => {} // do nothing + Err(e) => { + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + return Ok(()); + }; + Metrics::get() + .proxy + .cancellation_requests_total + .inc(CancellationRequest { + source: self.from, + kind: crate::metrics::CancellationOutcome::Found, + }); + info!("cancelling query per user's request using key {key}"); + cancel_closure.try_cancel_query().await } #[cfg(test)] - fn contains(&self, session: &Session) -> bool { - self.0.contains_key(&session.key) + fn contains(&self, session: &Session

) -> bool { + self.map.contains_key(&session.key) } #[cfg(test)] fn is_empty(&self) -> bool { - self.0.is_empty() + self.map.is_empty() + } +} + +impl CancellationHandler<()> { + pub fn new(map: CancelMap, from: CancellationSource) -> Self { + Self { + map, + client: (), + from, + } + } +} + +impl CancellationHandler>>> { + pub fn new(map: CancelMap, client: Option>>, from: CancellationSource) -> Self { + Self { map, client, from } } } @@ -87,73 +161,75 @@ impl CancelClosure { cancel_token, } } - /// Cancels the query running on user's compute node. - pub async fn try_cancel_query(self) -> anyhow::Result<()> { + pub async fn try_cancel_query(self) -> Result<(), CancelError> { let socket = TcpStream::connect(self.socket_addr).await?; self.cancel_token.cancel_query_raw(socket, NoTls).await?; - + info!("query was cancelled"); Ok(()) } } /// Helper for registering query cancellation tokens. -pub struct Session<'a> { +pub struct Session

{ /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancel_map: &'a CancelMap, + cancellation_handler: Arc>, } -impl<'a> Session<'a> { - fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self { - Self { key, cancel_map } +impl

Session

{ + /// Store the cancel token for the given session. + /// This enables query cancellation in `crate::proxy::prepare_client_connection`. + pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { + info!("enabling query cancellation for this session"); + self.cancellation_handler + .map + .insert(self.key, Some(cancel_closure)); + + self.key } } -impl Session<'_> { - /// Store the cancel token for the given session. - /// This enables query cancellation in `crate::proxy::prepare_client_connection`. - pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData { - info!("enabling query cancellation for this session"); - self.cancel_map.0.insert(self.key, Some(cancel_closure)); - - self.key +impl

Drop for Session

{ + fn drop(&mut self) { + self.cancellation_handler.map.remove(&self.key); + info!("dropped query cancellation key {}", &self.key); } } #[cfg(test)] mod tests { use super::*; - use once_cell::sync::Lazy; #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - static CANCEL_MAP: Lazy = Lazy::new(Default::default); - - let (tx, rx) = tokio::sync::oneshot::channel(); - let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move { - assert!(CANCEL_MAP.contains(&session)); - - tx.send(()).expect("failed to send"); - futures::future::pending::<()>().await; // sleep forever - - Ok(()) - })); - - // Wait until the task has been spawned. - rx.await.context("failed to hear from the task")?; - - // Drop the session's entry by cancelling the task. - task.abort(); - let error = task.await.expect_err("task should have failed"); - if !error.is_cancelled() { - anyhow::bail!(error); - } + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + CancelMap::default(), + CancellationSource::FromRedis, + )); + let session = cancellation_handler.clone().get_session(); + assert!(cancellation_handler.contains(&session)); + drop(session); // Check that the session has been dropped. 
- assert!(CANCEL_MAP.is_empty()); + assert!(cancellation_handler.is_empty()); Ok(()) } + + #[tokio::test] + async fn cancel_session_noop_regression() { + let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local); + handler + .cancel_session( + CancelKeyData { + backend_pid: 0, + cancel_key: 0, + }, + Uuid::new_v4(), + ) + .await + .unwrap(); + } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 9a5abe2960..feb09d5638 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,16 +1,23 @@ use crate::{ - auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError, - context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, + auth::parse_endpoint_param, + cancellation::CancelClosure, + console::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, + context::RequestMonitoring, + error::{ReportableError, UserFacingError}, + metrics::{Metrics, NumDbConnectionsGuard}, proxy::neon_option, + Host, }; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use metrics::IntCounterPairGuard; +use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use std::{io, net::SocketAddr, time::Duration}; +use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}; +use std::{io, net::SocketAddr, sync::Arc, time::Duration}; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; +use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; @@ -26,10 +33,13 @@ pub enum ConnectionError { CouldNotConnect(#[from] io::Error), #[error("{COULD_NOT_CONNECT}: {0}")] - TlsError(#[from] native_tls::Error), + TlsError(#[from] InvalidDnsNameError), #[error("{COULD_NOT_CONNECT}: {0}")] WakeComputeError(#[from] WakeComputeError), + + #[error("error acquiring resource permit: {0}")] + 
TooManyConnectionAttempts(#[from] ApiLockError), } impl UserFacingError for ConnectionError { @@ -39,33 +49,60 @@ impl UserFacingError for ConnectionError { // This helps us drop irrelevant library-specific prefixes. // TODO: propagate severity level and other parameters. Postgres(err) => match err.as_db_error() { - Some(err) => err.message().to_owned(), + Some(err) => { + let msg = err.message(); + + if msg.starts_with("unsupported startup parameter: ") + || msg.starts_with("unsupported startup parameter in options: ") + { + format!("{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter") + } else { + msg.to_owned() + } + } None => err.to_string(), }, WakeComputeError(err) => err.to_string_client(), + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } _ => COULD_NOT_CONNECT.to_owned(), } } } +impl ReportableError for ConnectionError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ConnectionError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, + ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, + ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, + ConnectionError::WakeComputeError(e) => e.get_error_kind(), + ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), + } + } +} + /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. pub type ScramKeys = tokio_postgres::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `tokio_postgres` will be replaced with something better. /// Newtype allows us to implement methods on top of it. 
-#[derive(Clone)] -#[repr(transparent)] +#[derive(Clone, Default)] pub struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { pub fn new() -> Self { - Self(Default::default()) + Self::default() } /// Reuse password or auth keys from the other config. - pub fn reuse_password(&mut self, other: &Self) { + pub fn reuse_password(&mut self, other: Self) { if let Some(password) = other.get_password() { self.password(password); } @@ -75,6 +112,16 @@ impl ConnCfg { } } + pub fn get_host(&self) -> Result { + match self.0.get_hosts() { + [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + // we should not have multiple address or unix addresses. + _ => Err(WakeComputeError::BadComputeAddress( + "invalid compute address".into(), + )), + } + } + /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. @@ -137,12 +184,6 @@ impl std::ops::DerefMut for ConnCfg { } } -impl Default for ConnCfg { - fn default() -> Self { - Self::new() - } -} - impl ConnCfg { /// Establish a raw TCP connection to the compute node. async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { @@ -219,14 +260,16 @@ pub struct PostgresConnection { /// Socket connected to a compute node. pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< tokio::net::TcpStream, - postgres_native_tls::TlsStream, + tokio_postgres_rustls::RustlsStream, >, /// PostgreSQL connection parameters. pub params: std::collections::HashMap, /// Query cancellation token. pub cancel_closure: CancelClosure, + /// Labels for proxy's metrics. 
+ pub aux: MetricsAuxInfo, - _guage: IntCounterPairGuard, + _guage: NumDbConnectionsGuard<'static>, } impl ConnCfg { @@ -235,23 +278,40 @@ impl ConnCfg { &self, ctx: &mut RequestMonitoring, allow_self_signed_compute: bool, + aux: MetricsAuxInfo, timeout: Duration, ) -> Result { + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (socket_addr, stream, host) = self.connect_raw(timeout).await?; + drop(pause); - let tls_connector = native_tls::TlsConnector::builder() - .danger_accept_invalid_certs(allow_self_signed_compute) - .build() - .unwrap(); - let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector); - let tls = MakeTlsConnect::::make_tls_connect(&mut mk_tls, host)?; + let client_config = if allow_self_signed_compute { + // Allow all certificates for creating the connection + let verifier = Arc::new(AcceptEverythingVerifier) as Arc; + rustls::ClientConfig::builder() + .dangerous() + .with_custom_certificate_verifier(verifier) + } else { + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + rustls::ClientConfig::builder().with_root_certificates(root_store) + }; + let client_config = client_config.with_no_client_auth(); + + let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let tls = >::make_tls_connect( + &mut mk_tls, + host, + )?; // connect_raw() will not use TLS if sslmode is "disable" + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let (client, connection) = self.0.connect_raw(stream, tls).await?; + drop(pause); tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); let stream = connection.stream.into_inner(); info!( + cold_start_info = ctx.cold_start_info.as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); @@ -269,9 +329,8 @@ impl ConnCfg { stream, params, cancel_closure, - _guage: NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(), + aux, 
+ _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol), }; Ok(connection) @@ -295,6 +354,58 @@ fn filtered_options(params: &StartupMessageParams) -> Option { Some(options) } +fn load_certs() -> Result, io::Error> { + let der_certs = rustls_native_certs::load_native_certs()?; + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs); + Ok(Arc::new(store)) +} +static TLS_ROOTS: OnceCell> = OnceCell::new(); + +#[derive(Debug)] +struct AcceptEverythingVerifier; +impl ServerCertVerifier for AcceptEverythingVerifier { + fn supported_verify_schemes(&self) -> Vec { + use rustls::SignatureScheme::*; + // The schemes for which `SignatureScheme::supported_in_tls13` returns true. + vec![ + ECDSA_NISTP521_SHA512, + ECDSA_NISTP384_SHA384, + ECDSA_NISTP256_SHA256, + RSA_PSS_SHA512, + RSA_PSS_SHA384, + RSA_PSS_SHA256, + ED25519, + ] + } + fn verify_server_cert( + &self, + _end_entity: &rustls::pki_types::CertificateDer<'_>, + _intermediates: &[rustls::pki_types::CertificateDer<'_>], + _server_name: &rustls::pki_types::ServerName<'_>, + _ocsp_response: &[u8], + _now: rustls::pki_types::UnixTime, + ) -> Result { + Ok(rustls::client::danger::ServerCertVerified::assertion()) + } + fn verify_tls12_signature( + &self, + _message: &[u8], + _cert: &rustls::pki_types::CertificateDer<'_>, + _dss: &rustls::DigitallySignedStruct, + ) -> Result { + Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) + } + fn verify_tls13_signature( + &self, + _message: &[u8], + _cert: &rustls::pki_types::CertificateDer<'_>, + _dss: &rustls::DigitallySignedStruct, + ) -> Result { + Ok(rustls::client::danger::HandshakeSignatureValid::assertion()) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 90956f84d3..f4707a33aa 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,18 @@ -use crate::{auth, rate_limiter::RateBucketInfo, serverless::GlobalConnPoolOptions}; +use crate::{ + 
auth::{self, backend::AuthRateLimiter}, + console::locks::ApiLocks, + rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, + scram::threadpool::ThreadPool, + serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, + Host, +}; use anyhow::{bail, ensure, Context, Ok}; -use rustls::{sign, Certificate, PrivateKey}; +use itertools::Itertools; +use remote_storage::RemoteStorageConfig; +use rustls::{ + crypto::ring::sign, + pki_types::{CertificateDer, PrivateKeyDer}, +}; use sha2::{Digest, Sha256}; use std::{ collections::{HashMap, HashSet}, @@ -13,36 +25,48 @@ use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<'static, ()>, + pub auth_backend: auth::BackendType<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, - pub endpoint_rps_limit: Vec, + pub redis_rps_limit: Vec, pub region: String, + pub handshake_timeout: Duration, + pub aws_region: String, + pub wake_compute_retry_config: RetryConfig, + pub connect_compute_locks: ApiLocks, + pub connect_to_compute_retry_config: RetryConfig, } #[derive(Debug)] pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, + pub backup_metric_collection_config: MetricBackupCollectionConfig, } pub struct TlsConfig { pub config: Arc, - pub common_names: Option>, + pub common_names: HashSet, pub cert_resolver: Arc, } pub struct HttpConfig { pub request_timeout: tokio::time::Duration, pub pool_options: GlobalConnPoolOptions, + pub cancel_set: CancelSet, + pub client_conn_threshold: u64, } pub struct AuthenticationConfig { + pub thread_pool: Arc, pub scram_protocol_timeout: tokio::time::Duration, + pub rate_limiter_enabled: bool, + pub rate_limiter: AuthRateLimiter, + pub rate_limit_ip_subnet: u8, } impl TlsConfig { @@ -86,18 +110,18 @@ pub fn 
configure_tls( let cert_resolver = Arc::new(cert_resolver); - let config = rustls::ServerConfig::builder() - .with_safe_default_cipher_suites() - .with_safe_default_kx_groups() - // allow TLS 1.2 to be compatible with older client libraries - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])? - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + // allow TLS 1.2 to be compatible with older client libraries + let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + &rustls::version::TLS13, + &rustls::version::TLS12, + ]) + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()) + .into(); Ok(TlsConfig { config, - common_names: Some(common_names), + common_names, cert_resolver, }) } @@ -131,14 +155,14 @@ pub enum TlsServerEndPoint { } impl TlsServerEndPoint { - pub fn new(cert: &Certificate) -> anyhow::Result { + pub fn new(cert: &CertificateDer) -> anyhow::Result { let sha256_oids = [ // I'm explicitly not adding MD5 or SHA1 here... They're bad. oid_registry::OID_SIG_ECDSA_WITH_SHA256, oid_registry::OID_PKCS1_SHA256WITHRSA, ]; - let pem = x509_parser::parse_x509_certificate(&cert.0) + let pem = x509_parser::parse_x509_certificate(cert) .context("Failed to parse PEM object from cerficiate")? 
.1; @@ -148,8 +172,7 @@ impl TlsServerEndPoint { let oid = pem.signature_algorithm.oid(); let alg = reg.get(oid); if sha256_oids.contains(oid) { - let tls_server_end_point: [u8; 32] = - Sha256::new().chain_update(&cert.0).finalize().into(); + let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { @@ -163,7 +186,7 @@ impl TlsServerEndPoint { } } -#[derive(Default)] +#[derive(Default, Debug)] pub struct CertResolver { certs: HashMap, TlsServerEndPoint)>, default: Option<(Arc, TlsServerEndPoint)>, @@ -183,11 +206,14 @@ impl CertResolver { let priv_key = { let key_bytes = std::fs::read(key_path) .context(format!("Failed to read TLS keys at '{key_path}'"))?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to parse TLS keys at '{key_path}'"))?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() + PrivateKeyDer::Pkcs8( + keys.pop() + .unwrap() + .context(format!("Failed to parse TLS keys at '{key_path}'"))?, + ) }; let cert_chain_bytes = std::fs::read(cert_path) @@ -195,14 +221,10 @@ impl CertResolver { let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() .with_context(|| { - format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ) + format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") })? 
- .into_iter() - .map(rustls::Certificate) - .collect() }; self.add_cert(priv_key, cert_chain, is_default) @@ -210,15 +232,15 @@ impl CertResolver { pub fn add_cert( &mut self, - priv_key: PrivateKey, - cert_chain: Vec, + priv_key: PrivateKeyDer<'static>, + cert_chain: Vec>, is_default: bool, ) -> anyhow::Result<()> { let key = sign::any_supported_type(&priv_key).context("invalid private key")?; let first_cert = &cert_chain[0]; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(&first_cert.0) + let pem = x509_parser::parse_x509_certificate(first_cert) .context("Failed to parse PEM object from cerficiate")? .1; @@ -300,6 +322,95 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct EndpointCacheConfig { + /// Batch size to receive all endpoints on the startup. + pub initial_batch_size: usize, + /// Batch size to receive endpoints. + pub default_batch_size: usize, + /// Timeouts for the stream read operation. + pub xread_timeout: Duration, + /// Stream name to read from. + pub stream_name: String, + /// Limiter info (to distinguish when to enable cache). + pub limiter_info: Vec, + /// Disable cache. + /// If true, cache is ignored, but reports all statistics. + pub disable_cache: bool, + /// Retry interval for the stream read operation. + pub retry_interval: Duration, +} + +impl EndpointCacheConfig { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + /// Notice that by default the limiter is empty, which means that cache is disabled. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. 
+ fn parse(options: &str) -> anyhow::Result { + let mut initial_batch_size = None; + let mut default_batch_size = None; + let mut xread_timeout = None; + let mut stream_name = None; + let mut limiter_info = vec![]; + let mut disable_cache = false; + let mut retry_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "initial_batch_size" => initial_batch_size = Some(value.parse()?), + "default_batch_size" => default_batch_size = Some(value.parse()?), + "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?), + "stream_name" => stream_name = Some(value.to_string()), + "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?), + "disable_cache" => disable_cache = value.parse()?, + "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + RateBucketInfo::validate(&mut limiter_info)?; + + Ok(Self { + initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?, + default_batch_size: default_batch_size.context("missing `default_batch_size`")?, + xread_timeout: xread_timeout.context("missing `xread_timeout`")?, + stream_name: stream_name.context("missing `stream_name`")?, + disable_cache, + limiter_info, + retry_interval: retry_interval.context("missing `retry_interval`")?, + }) + } +} + +impl FromStr for EndpointCacheConfig { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse endpoint cache options '{options}'"); + Self::parse(options).with_context(error) + } +} +#[derive(Debug)] +pub struct MetricBackupCollectionConfig { + pub interval: Duration, + pub remote_storage_config: OptRemoteStorageConfig, + pub chunk_size: usize, +} + +/// Hack to avoid clap being smarter. 
If you don't use this type alias, clap assumes more about the optional state and you get +/// runtime type errors from the value parser we use. +pub type OptRemoteStorageConfig = Option; + +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { + RemoteStorageConfig::from_toml(&s.parse()?) +} + /// Helper for cmdline cache options parsing. #[derive(Debug)] pub struct CacheOptions { @@ -353,26 +464,154 @@ impl FromStr for CacheOptions { } /// Helper for cmdline cache options parsing. -pub struct WakeComputeLockOptions { +#[derive(Debug)] +pub struct ProjectInfoCacheOptions { + /// Max number of entries. + pub size: usize, + /// Entry's time-to-live. + pub ttl: Duration, + /// Max number of roles per endpoint. + pub max_roles: usize, + /// Gc interval. + pub gc_interval: Duration, +} + +impl ProjectInfoCacheOptions { + /// Default options for [`crate::console::provider::NodeInfoCache`]. + pub const CACHE_DEFAULT_OPTIONS: &'static str = + "size=10000,ttl=4m,max_roles=10,gc_interval=60m"; + + /// Parse cache options passed via cmdline. + /// Example: [`Self::CACHE_DEFAULT_OPTIONS`]. + fn parse(options: &str) -> anyhow::Result { + let mut size = None; + let mut ttl = None; + let mut max_roles = None; + let mut gc_interval = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "size" => size = Some(value.parse()?), + "ttl" => ttl = Some(humantime::parse_duration(value)?), + "max_roles" => max_roles = Some(value.parse()?), + "gc_interval" => gc_interval = Some(humantime::parse_duration(value)?), + unknown => bail!("unknown key: {unknown}"), + } + } + + // TTL doesn't matter if cache is always empty. 
+ if let Some(0) = size { + ttl.get_or_insert(Duration::default()); + } + + Ok(Self { + size: size.context("missing `size`")?, + ttl: ttl.context("missing `ttl`")?, + max_roles: max_roles.context("missing `max_roles`")?, + gc_interval: gc_interval.context("missing `gc_interval`")?, + }) + } +} + +impl FromStr for ProjectInfoCacheOptions { + type Err = anyhow::Error; + + fn from_str(options: &str) -> Result { + let error = || format!("failed to parse cache options '{options}'"); + Self::parse(options).with_context(error) + } +} + +/// This is a config for connect to compute and wake compute. +#[derive(Clone, Copy, Debug)] +pub struct RetryConfig { + /// Number of times we should retry. + pub max_retries: u32, + /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0 + pub base_delay: tokio::time::Duration, + /// Exponential base for retry wait duration + pub backoff_factor: f64, +} + +impl RetryConfig { + /// Default options for RetryConfig. + + /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. + pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=5,base_retry_wait_duration=200ms,retry_wait_exponent_base=2"; + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Cplane has timeout of 60s on each request. 8m7s in total. + pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + + /// Parse retry options passed via cmdline. + /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. 
+ pub fn parse(options: &str) -> anyhow::Result { + let mut num_retries = None; + let mut base_retry_wait_duration = None; + let mut retry_wait_exponent_base = None; + + for option in options.split(',') { + let (key, value) = option + .split_once('=') + .with_context(|| format!("bad key-value pair: {option}"))?; + + match key { + "num_retries" => num_retries = Some(value.parse()?), + "base_retry_wait_duration" => { + base_retry_wait_duration = Some(humantime::parse_duration(value)?) + } + "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?), + unknown => bail!("unknown key: {unknown}"), + } + } + + Ok(Self { + max_retries: num_retries.context("missing `num_retries`")?, + base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?, + backoff_factor: retry_wait_exponent_base + .context("missing `retry_wait_exponent_base`")?, + }) + } +} + +/// Helper for cmdline cache options parsing. +#[derive(serde::Deserialize)] +pub struct ConcurrencyLockOptions { /// The number of shards the lock map should have pub shards: usize, /// The number of allowed concurrent requests for each endpoitn - pub permits: usize, + #[serde(flatten)] + pub limiter: RateLimiterConfig, /// Garbage collection epoch + #[serde(deserialize_with = "humantime_serde::deserialize")] pub epoch: Duration, /// Lock timeout + #[serde(deserialize_with = "humantime_serde::deserialize")] pub timeout: Duration, } -impl WakeComputeLockOptions { +impl ConcurrencyLockOptions { /// Default options for [`crate::console::provider::ApiLocks`]. pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; + /// Default options for [`crate::console::provider::ApiLocks`]. + pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = + "shards=64,permits=100,epoch=10m,timeout=10ms"; // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s"; /// Parse lock options passed via cmdline. 
/// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`]. fn parse(options: &str) -> anyhow::Result { + let options = options.trim(); + if options.starts_with('{') && options.ends_with('}') { + return Ok(serde_json::from_str(options)?); + } + let mut shards = None; let mut permits = None; let mut epoch = None; @@ -399,9 +638,13 @@ impl WakeComputeLockOptions { shards = Some(2); } + let permits = permits.context("missing `permits`")?; let out = Self { shards: shards.context("missing `shards`")?, - permits: permits.context("missing `permits`")?, + limiter: RateLimiterConfig { + algorithm: RateLimitAlgorithm::Fixed, + initial_limit: permits, + }, epoch: epoch.context("missing `epoch`")?, timeout: timeout.context("missing `timeout`")?, }; @@ -416,7 +659,7 @@ impl WakeComputeLockOptions { } } -impl FromStr for WakeComputeLockOptions { +impl FromStr for ConcurrencyLockOptions { type Err = anyhow::Error; fn from_str(options: &str) -> Result { @@ -427,6 +670,8 @@ impl FromStr for WakeComputeLockOptions { #[cfg(test)] mod tests { + use crate::rate_limiter::Aimd; + use super::*; #[test] @@ -452,38 +697,70 @@ mod tests { #[test] fn test_parse_lock_options() -> anyhow::Result<()> { - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?; assert_eq!(epoch, Duration::from_secs(10 * 60)); assert_eq!(timeout, Duration::from_secs(1)); assert_eq!(shards, 32); - assert_eq!(permits, 4); + assert_eq!(limiter.initial_limit, 4); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?; assert_eq!(epoch, Duration::from_secs(60)); assert_eq!(timeout, Duration::from_millis(100)); assert_eq!(shards, 16); - assert_eq!(permits, 8); + assert_eq!(limiter.initial_limit, 8); + assert_eq!(limiter.algorithm, 
RateLimitAlgorithm::Fixed); - let WakeComputeLockOptions { + let ConcurrencyLockOptions { epoch, - permits, + limiter, shards, timeout, } = "permits=0".parse()?; assert_eq!(epoch, Duration::ZERO); assert_eq!(timeout, Duration::ZERO); assert_eq!(shards, 2); - assert_eq!(permits, 0); + assert_eq!(limiter.initial_limit, 0); + assert_eq!(limiter.algorithm, RateLimitAlgorithm::Fixed); + + Ok(()) + } + + #[test] + fn test_parse_json_lock_options() -> anyhow::Result<()> { + let ConcurrencyLockOptions { + epoch, + limiter, + shards, + timeout, + } = r#"{"shards":32,"initial_limit":44,"aimd":{"min":5,"max":500,"inc":10,"dec":0.9,"utilisation":0.8},"epoch":"10m","timeout":"1s"}"# + .parse()?; + assert_eq!(epoch, Duration::from_secs(10 * 60)); + assert_eq!(timeout, Duration::from_secs(1)); + assert_eq!(shards, 32); + assert_eq!(limiter.initial_limit, 44); + assert_eq!( + limiter.algorithm, + RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 5, + max: 500, + dec: 0.9, + inc: 10, + utilisation: 0.8 + } + }, + ); Ok(()) } diff --git a/proxy/src/console.rs b/proxy/src/console.rs index 07bc807950..ea95e83437 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -6,7 +6,7 @@ pub mod messages; /// Wrappers for console APIs and their mocks. pub mod provider; -pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo}; +pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; /// Various cache-related types. 
pub mod caches { diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 837379b21f..3b7d681a41 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -1,12 +1,183 @@ -use serde::Deserialize; -use smol_str::SmolStr; -use std::fmt; +use measured::FixedCardinalityLabel; +use serde::{Deserialize, Serialize}; +use std::fmt::{self, Display}; + +use crate::auth::IpPattern; + +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::proxy::retry::ShouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. #[derive(Debug, Deserialize)] pub struct ConsoleError { pub error: Box, + #[serde(skip)] + pub http_status_code: http::StatusCode, + pub status: Option, +} + +impl ConsoleError { + pub fn get_reason(&self) -> Reason { + self.status + .as_ref() + .and_then(|s| s.details.error_info.as_ref()) + .map(|e| e.reason) + .unwrap_or(Reason::Unknown) + } + pub fn get_user_facing_message(&self) -> String { + use super::provider::errors::REQUEST_FAILED; + self.status + .as_ref() + .and_then(|s| s.details.user_facing_message.as_ref()) + .map(|m| m.message.clone().into()) + .unwrap_or_else(|| { + // Ask @neondatabase/control-plane for review before adding more. + match self.http_status_code { + http::StatusCode::NOT_FOUND => { + // Status 404: failed to get a project-related resource. + format!("{REQUEST_FAILED}: endpoint cannot be found") + } + http::StatusCode::NOT_ACCEPTABLE => { + // Status 406: endpoint is disabled (we don't allow connections). + format!("{REQUEST_FAILED}: endpoint is disabled") + } + http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => { + // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. + format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. 
Check your quotas and/or contact our support.") + } + _ => REQUEST_FAILED.to_owned(), + } + }) + } +} + +impl Display for ConsoleError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let msg = self + .status + .as_ref() + .and_then(|s| s.details.user_facing_message.as_ref()) + .map(|m| m.message.as_ref()) + .unwrap_or_else(|| &self.error); + write!(f, "{}", msg) + } +} + +impl ShouldRetry for ConsoleError { + fn could_retry(&self) -> bool { + if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() { + // retry some temporary failures because the compute was in a bad state + // (bad request can be returned when the endpoint was in transition) + return match &self { + ConsoleError { + http_status_code: http::StatusCode::BAD_REQUEST, + .. + } => true, + // don't retry when quotas are exceeded + ConsoleError { + http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, + ref error, + .. + } => !error.contains("compute time quota of non-primary branches is exceeded"), + // locked can be returned when the endpoint was in transition + // or when quotas are exceeded. don't retry when quotas are exceeded + ConsoleError { + http_status_code: http::StatusCode::LOCKED, + ref error, + .. + } => { + !error.contains("quota exceeded") + && !error.contains("the limit for current plan reached") + } + _ => false, + }; + } + + // retry if the response has a retry delay + if let Some(retry_info) = self + .status + .as_ref() + .and_then(|s| s.details.retry_info.as_ref()) + { + retry_info.retry_delay_ms > 0 + } else { + false + } + } +} + +#[derive(Debug, Deserialize)] +pub struct Status { + pub code: Box, + pub message: Box, + pub details: Details, +} + +#[derive(Debug, Deserialize)] +pub struct Details { + pub error_info: Option, + pub retry_info: Option, + pub user_facing_message: Option, +} + +#[derive(Debug, Deserialize)] +pub struct ErrorInfo { + pub reason: Reason, + // Schema could also have `metadata` field, but it's not structured. 
Skip it for now. +} + +#[derive(Clone, Copy, Debug, Deserialize, Default)] +pub enum Reason { + #[serde(rename = "ROLE_PROTECTED")] + RoleProtected, + #[serde(rename = "RESOURCE_NOT_FOUND")] + ResourceNotFound, + #[serde(rename = "PROJECT_NOT_FOUND")] + ProjectNotFound, + #[serde(rename = "ENDPOINT_NOT_FOUND")] + EndpointNotFound, + #[serde(rename = "BRANCH_NOT_FOUND")] + BranchNotFound, + #[serde(rename = "RATE_LIMIT_EXCEEDED")] + RateLimitExceeded, + #[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")] + NonPrimaryBranchComputeTimeExceeded, + #[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")] + ActiveTimeQuotaExceeded, + #[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")] + ComputeTimeQuotaExceeded, + #[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")] + WrittenDataQuotaExceeded, + #[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")] + DataTransferQuotaExceeded, + #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")] + LogicalSizeQuotaExceeded, + #[default] + #[serde(other)] + Unknown, +} + +impl Reason { + pub fn is_not_found(&self) -> bool { + matches!( + self, + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound + ) + } +} + +#[derive(Debug, Deserialize)] +pub struct RetryInfo { + pub retry_delay_ms: u64, +} + +#[derive(Debug, Deserialize)] +pub struct UserFacingMessage { + pub message: Box, } /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. @@ -14,7 +185,8 @@ pub struct ConsoleError { #[derive(Deserialize)] pub struct GetRoleSecret { pub role_secret: Box, - pub allowed_ips: Option>>, + pub allowed_ips: Option>, + pub project_id: Option, } // Manually implement debug to omit sensitive info. @@ -89,35 +261,48 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. 
-#[derive(Debug, Deserialize, Clone, Default)] +#[derive(Debug, Deserialize, Clone)] pub struct MetricsAuxInfo { - pub endpoint_id: SmolStr, - pub project_id: SmolStr, - pub branch_id: SmolStr, + pub endpoint_id: EndpointIdInt, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, + #[serde(default)] + pub cold_start_info: ColdStartInfo, } -impl MetricsAuxInfo { - /// Definitions of labels for traffic metric. - pub const TRAFFIC_LABELS: &'static [&'static str] = &[ - // Received (rx) / sent (tx). - "direction", - // ID of a project. - "project_id", - // ID of an endpoint within a project. - "endpoint_id", - // ID of a branch within a project (snapshot). - "branch_id", - ]; +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)] +#[serde(rename_all = "snake_case")] +pub enum ColdStartInfo { + #[default] + Unknown, + /// Compute was already running + Warm, + #[serde(rename = "pool_hit")] + #[label(rename = "pool_hit")] + /// Compute was not running but there was an available VM + VmPoolHit, + #[serde(rename = "pool_miss")] + #[label(rename = "pool_miss")] + /// Compute was not running and there were no VMs available + VmPoolMiss, - /// Values of labels for traffic metric. - // TODO: add more type safety (validate arity & positions). 
- pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] { - [ - direction, - &self.project_id, - &self.endpoint_id, - &self.branch_id, - ] + // not provided by control plane + /// Connection available from HTTP pool + HttpPoolHit, + /// Cached connection info + WarmCached, +} + +impl ColdStartInfo { + pub fn as_str(&self) -> &'static str { + match self { + ColdStartInfo::Unknown => "unknown", + ColdStartInfo::Warm => "warm", + ColdStartInfo::VmPoolHit => "pool_hit", + ColdStartInfo::VmPoolMiss => "pool_miss", + ColdStartInfo::HttpPoolHit => "http_pool_hit", + ColdStartInfo::WarmCached => "warm_cached", + } } } @@ -131,6 +316,7 @@ mod tests { "endpoint_id": "endpoint", "project_id": "project", "branch_id": "branch", + "cold_start_info": "unknown", }) } @@ -207,12 +393,17 @@ mod tests { "role_secret": "secret", }); let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; - // Empty `allowed_ips` field. let json = json!({ "role_secret": "secret", "allowed_ips": ["8.8.8.8"], }); let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; + let json = json!({ + "role_secret": "secret", + "allowed_ips": ["8.8.8.8"], + "project_id": "project", + }); + let _: GetRoleSecret = serde_json::from_str(&json.to_string())?; Ok(()) } diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index f0e084b679..c7a2d467c0 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -4,7 +4,7 @@ use crate::{ }; use anyhow::Context; use once_cell::sync::Lazy; -use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; +use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; use std::{convert::Infallible, future}; use tokio::net::{TcpListener, TcpStream}; @@ -13,16 +13,10 @@ use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's 
reply. -pub async fn with_waiter( +pub fn get_waiter( psql_session_id: impl Into, - action: impl FnOnce(Waiter<'static, ComputeReady>) -> R, -) -> Result -where - R: std::future::Future>, - E: From, -{ - let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - action(waiter).await +) -> Result, waiters::RegisterError> { + CPLANE_WAITERS.register(psql_session_id.into()) } pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> { @@ -77,7 +71,7 @@ async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { } /// A message received by `mgmt` when a compute node is ready. -pub type ComputeReady = Result; +pub type ComputeReady = DatabaseInfo; // TODO: replace with an http-based protocol. struct MgmtHandler; @@ -102,7 +96,7 @@ fn try_process_query(pgb: &mut PostgresBackendTCP, query: &str) -> Result<(), Qu let _enter = span.enter(); info!("got response: {:?}", resp.result); - match notify(resp.session_id, Ok(resp.result)) { + match notify(resp.session_id, resp.result) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? 
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 974384bd5b..915c2ee7a6 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -1,45 +1,47 @@ -#[cfg(feature = "testing")] +#[cfg(any(test, feature = "testing"))] pub mod mock; pub mod neon; use super::messages::MetricsAuxInfo; use crate::{ - auth::backend::ComputeUserInfo, - cache::{timed_lru, TimedLru}, + auth::{ + backend::{ComputeCredentialKeys, ComputeUserInfo}, + IpPattern, + }, + cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, + config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, context::RequestMonitoring, - scram, + error::ReportableError, + intern::ProjectIdInt, + metrics::ApiLockMetrics, + rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, + scram, EndpointCacheKey, }; -use async_trait::async_trait; use dashmap::DashMap; -use smol_str::SmolStr; -use std::{sync::Arc, time::Duration}; -use tokio::{ - sync::{OwnedSemaphorePermit, Semaphore}, - time::Instant, -}; +use std::{hash::Hash, sync::Arc, time::Duration}; +use tokio::time::Instant; use tracing::info; pub mod errors { use crate::{ - error::{io_error, UserFacingError}, - http, + console::messages::{self, ConsoleError}, + error::{io_error, ReportableError, UserFacingError}, proxy::retry::ShouldRetry, }; use thiserror::Error; + use super::ApiLockError; + /// A go-to error message which doesn't leak any detail. - const REQUEST_FAILED: &str = "Console request failed"; + pub const REQUEST_FAILED: &str = "Console request failed"; /// Common console API error. #[derive(Debug, Error)] pub enum ApiError { /// Error returned by the console itself. - #[error("{REQUEST_FAILED} with {}: {}", .status, .text)] - Console { - status: http::StatusCode, - text: Box, - }, + #[error("{REQUEST_FAILED} with {0}")] + Console(ConsoleError), /// Various IO errors like broken pipe or malformed payload. 
#[error("{REQUEST_FAILED}: {0}")] @@ -48,11 +50,11 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn http_status_code(&self) -> Option { + pub fn get_reason(&self) -> messages::Reason { use ApiError::*; match self { - Console { status, .. } => Some(*status), - _ => None, + Console(e) => e.get_reason(), + _ => messages::Reason::Unknown, } } } @@ -62,52 +64,76 @@ pub mod errors { use ApiError::*; match self { // To minimize risks, only select errors are forwarded to users. - // Ask @neondatabase/control-plane for review before adding more. - Console { status, .. } => match *status { - http::StatusCode::NOT_FOUND => { - // Status 404: failed to get a project-related resource. - format!("{REQUEST_FAILED}: endpoint cannot be found") - } - http::StatusCode::NOT_ACCEPTABLE => { - // Status 406: endpoint is disabled (we don't allow connections). - format!("{REQUEST_FAILED}: endpoint is disabled") - } - http::StatusCode::LOCKED => { - // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded. - format!("{REQUEST_FAILED}: endpoint is temporary unavailable. 
check your quotas and/or contact our support") - } - _ => REQUEST_FAILED.to_owned(), - }, + Console(c) => c.get_user_facing_message(), _ => REQUEST_FAILED.to_owned(), } } } + impl ReportableError for ApiError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiError::Console(e) => { + use crate::error::ErrorKind::*; + match e.get_reason() { + crate::console::messages::Reason::RoleProtected => User, + crate::console::messages::Reason::ResourceNotFound => User, + crate::console::messages::Reason::ProjectNotFound => User, + crate::console::messages::Reason::EndpointNotFound => User, + crate::console::messages::Reason::BranchNotFound => User, + crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit, + crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { + User + } + crate::console::messages::Reason::ActiveTimeQuotaExceeded => User, + crate::console::messages::Reason::ComputeTimeQuotaExceeded => User, + crate::console::messages::Reason::WrittenDataQuotaExceeded => User, + crate::console::messages::Reason::DataTransferQuotaExceeded => User, + crate::console::messages::Reason::LogicalSizeQuotaExceeded => User, + crate::console::messages::Reason::Unknown => match &e { + ConsoleError { + http_status_code: + http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, + .. + } => crate::error::ErrorKind::User, + ConsoleError { + http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, + error, + .. + } if error.contains( + "compute time quota of non-primary branches is exceeded", + ) => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::LOCKED, + error, + .. + } if error.contains("quota exceeded") + || error.contains("the limit for current plan reached") => + { + crate::error::ErrorKind::User + } + ConsoleError { + http_status_code: http::StatusCode::TOO_MANY_REQUESTS, + .. + } => crate::error::ErrorKind::ServiceRateLimit, + ConsoleError { .. 
} => crate::error::ErrorKind::ControlPlane, + }, + } + } + ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, + } + } + } + impl ShouldRetry for ApiError { fn could_retry(&self) -> bool { match self { // retry some transport errors Self::Transport(io) => io.could_retry(), - // retry some temporary failures because the compute was in a bad state - // (bad request can be returned when the endpoint was in transition) - Self::Console { - status: http::StatusCode::BAD_REQUEST, - .. - } => true, - // locked can be returned when the endpoint was in transition - // or when quotas are exceeded. don't retry when quotas are exceeded - Self::Console { - status: http::StatusCode::LOCKED, - ref text, - } => { - // written data quota exceeded - // data transfer quota exceeded - // compute time quota exceeded - // logical size quota exceeded - !text.contains("quota exceeded") - && !text.contains("the limit for current plan reached") - } - _ => false, + Self::Console(e) => e.could_retry(), } } } @@ -152,6 +178,16 @@ pub mod errors { } } } + + impl ReportableError for GetAuthInfoError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + GetAuthInfoError::BadSecret => crate::error::ErrorKind::ControlPlane, + GetAuthInfoError::ApiError(_) => crate::error::ErrorKind::ControlPlane, + } + } + } + #[derive(Debug, Error)] pub enum WakeComputeError { #[error("Console responded with a malformed compute address: {0}")] @@ -160,8 +196,11 @@ pub mod errors { #[error(transparent)] ApiError(ApiError), - #[error("Timeout waiting to acquire wake compute lock")] - TimeoutError, + #[error("Too many connections attempts")] + TooManyConnections, + + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), } // This allows more useful interactions than `#[from]`. 
@@ -171,17 +210,6 @@ pub mod errors { } } - impl From for WakeComputeError { - fn from(_: tokio::sync::AcquireError) -> Self { - WakeComputeError::TimeoutError - } - } - impl From for WakeComputeError { - fn from(_: tokio::time::error::Elapsed) -> Self { - WakeComputeError::TimeoutError - } - } - impl UserFacingError for WakeComputeError { fn to_string_client(&self) -> String { use WakeComputeError::*; @@ -192,32 +220,31 @@ pub mod errors { // However, API might return a meaningful error. ApiError(e) => e.to_string_client(), - TimeoutError => "timeout while acquiring the compute resource lock".to_owned(), + TooManyConnections => self.to_string(), + + TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } + } + } + } + + impl ReportableError for WakeComputeError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, + WakeComputeError::ApiError(e) => e.get_error_kind(), + WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit, + WakeComputeError::TooManyConnectionAttempts(e) => e.get_error_kind(), } } } } -/// Extra query params we'd like to pass to the console. -pub struct ConsoleReqExtra { - pub options: Vec<(String, String)>, -} - -impl ConsoleReqExtra { - // https://swagger.io/docs/specification/serialization/ DeepObject format - // paramName[prop1]=value1¶mName[prop2]=value2&.... - pub fn options_as_deep_object(&self) -> Vec<(String, String)> { - self.options - .iter() - .map(|(k, v)| (format!("options[{}]", k), v.to_string())) - .collect() - } -} - /// Auth secret which is managed by the cloud. -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq, Debug)] pub enum AuthSecret { - #[cfg(feature = "testing")] + #[cfg(any(test, feature = "testing"))] /// Md5 hash of user's password. 
Md5([u8; 16]), @@ -229,7 +256,9 @@ pub enum AuthSecret { pub struct AuthInfo { pub secret: Option, /// List of IP addresses allowed for the autorization. - pub allowed_ips: Vec, + pub allowed_ips: Vec, + /// Project ID. This is used for cache invalidation. + pub project_id: Option, } /// Info for establishing a connection to a compute node. @@ -248,123 +277,202 @@ pub struct NodeInfo { pub allow_self_signed_compute: bool, } -pub type NodeInfoCache = TimedLru, NodeInfo>; -pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>; -pub type AllowedIpsCache = TimedLru>>; -pub type RoleSecretCache = TimedLru<(SmolStr, SmolStr), Option>; -pub type CachedRoleSecret = timed_lru::Cached<&'static RoleSecretCache>; +impl NodeInfo { + pub async fn connect( + &self, + ctx: &mut RequestMonitoring, + timeout: Duration, + ) -> Result { + self.config + .connect( + ctx, + self.allow_self_signed_compute, + self.aux.clone(), + timeout, + ) + .await + } + pub fn reuse_settings(&mut self, other: Self) { + self.allow_self_signed_compute = other.allow_self_signed_compute; + self.config.reuse_password(other.config); + } + + pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) { + match keys { + ComputeCredentialKeys::Password(password) => self.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), + }; + } +} + +pub type NodeInfoCache = TimedLru; +pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; +pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; +pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; /// This will allocate per each call, but the http requests alone /// already require a few allocations, so it should be fine. -#[async_trait] -pub trait Api { +pub(crate) trait Api { /// Get the client's auth secret for authentication. + /// Returns option because user not found situation is special. 
+ /// We still have to mock the scram to avoid leaking information that user doesn't exist. async fn get_role_secret( &self, ctx: &mut RequestMonitoring, - creds: &ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result; - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, - creds: &ComputeUserInfo, - ) -> Result>, errors::GetAuthInfoError>; + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, ctx: &mut RequestMonitoring, - extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result; } +#[non_exhaustive] +pub enum ConsoleBackend { + /// Current Cloud API (V2). + Console(neon::Api), + /// Local mock of Cloud API (V2). + #[cfg(any(test, feature = "testing"))] + Postgres(mock::Api), + /// Internal testing + #[cfg(test)] + Test(Box), +} + +impl Api for ConsoleBackend { + async fn get_role_secret( + &self, + ctx: &mut RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result { + use ConsoleBackend::*; + match self { + Console(api) => api.get_role_secret(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Postgres(api) => api.get_role_secret(ctx, user_info).await, + #[cfg(test)] + Test(_) => unreachable!("this function should never be called in the test backend"), + } + } + + async fn get_allowed_ips_and_secret( + &self, + ctx: &mut RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { + use ConsoleBackend::*; + match self { + Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + #[cfg(test)] + Test(api) => api.get_allowed_ips_and_secret(), + } + } + + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + 
user_info: &ComputeUserInfo, + ) -> Result { + use ConsoleBackend::*; + + match self { + Console(api) => api.wake_compute(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Postgres(api) => api.wake_compute(ctx, user_info).await, + #[cfg(test)] + Test(api) => api.wake_compute(), + } + } +} + /// Various caches for [`console`](super). pub struct ApiCaches { /// Cache for the `wake_compute` API method. pub node_info: NodeInfoCache, - /// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead. - pub allowed_ips: AllowedIpsCache, - /// Cache for the `get_role_secret`. TODO(anna): use notifications listener instead. - pub role_secret: RoleSecretCache, + /// Cache which stores project_id -> endpoint_ids mapping. + pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, +} + +impl ApiCaches { + pub fn new( + wake_compute_cache_config: CacheOptions, + project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, + ) -> Self { + Self { + node_info: NodeInfoCache::new( + "node_info_cache", + wake_compute_cache_config.size, + wake_compute_cache_config.ttl, + true, + ), + project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), + } + } } /// Various caches for [`console`](super). 
-pub struct ApiLocks { +pub struct ApiLocks { name: &'static str, - node_locks: DashMap, Arc>, - permits: usize, + node_locks: DashMap>, + config: RateLimiterConfig, timeout: Duration, - registered: prometheus::IntCounter, - unregistered: prometheus::IntCounter, - reclamation_lag: prometheus::Histogram, - lock_acquire_lag: prometheus::Histogram, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, } -impl ApiLocks { +#[derive(Debug, thiserror::Error)] +pub enum ApiLockError { + #[error("timeout acquiring resource permit")] + TimeoutError(#[from] tokio::time::error::Elapsed), +} + +impl ReportableError for ApiLockError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, + } + } +} + +impl ApiLocks { pub fn new( name: &'static str, - permits: usize, + config: RateLimiterConfig, shards: usize, timeout: Duration, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, ) -> prometheus::Result { - let registered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_registered", - "Number of semaphores registered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(registered.clone()))?; - let unregistered = prometheus::IntCounter::with_opts( - prometheus::Opts::new( - "semaphores_unregistered", - "Number of semaphores unregistered in this api lock", - ) - .namespace(name), - )?; - prometheus::register(Box::new(unregistered.clone()))?; - let reclamation_lag = prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "reclamation_lag_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 1us -> 65ms - // benchmarks on my mac indicate it's usually in the range of 256us and 512us - .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?), - )?; - prometheus::register(Box::new(reclamation_lag.clone()))?; - let lock_acquire_lag = 
prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "semaphore_acquire_seconds", - "Time it takes to reclaim unused semaphores in the api lock", - ) - .namespace(name) - // 0.1ms -> 6s - .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?), - )?; - prometheus::register(Box::new(lock_acquire_lag.clone()))?; - Ok(Self { name, node_locks: DashMap::with_shard_amount(shards), - permits, + config, timeout, - lock_acquire_lag, - registered, - unregistered, - reclamation_lag, + epoch, + metrics, }) } - pub async fn get_wake_compute_permit( - &self, - key: &Arc, - ) -> Result { - if self.permits == 0 { - return Ok(WakeComputePermit { permit: None }); + pub async fn get_permit(&self, key: &K) -> Result { + if self.config.initial_limit == 0 { + return Ok(WakeComputePermit { + permit: Token::disabled(), + }); } let now = Instant::now(); let semaphore = { @@ -375,28 +483,27 @@ impl ApiLocks { self.node_locks .entry(key.clone()) .or_insert_with(|| { - self.registered.inc(); - Arc::new(Semaphore::new(self.permits)) + self.metrics.semaphores_registered.inc(); + DynamicLimiter::new(self.config) }) .clone() } }; - let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await; + let permit = semaphore.acquire_timeout(self.timeout).await; - self.lock_acquire_lag - .observe((Instant::now() - now).as_secs_f64()); - - Ok(WakeComputePermit { - permit: Some(permit??), - }) + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); + info!("acquired permit {:?}", now.elapsed().as_secs_f64()); + Ok(WakeComputePermit { permit: permit? 
}) } - pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) { - if self.permits == 0 { + pub async fn garbage_collect_worker(&self) { + if self.config.initial_limit == 0 { return; } - - let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32); + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); loop { for (i, shard) in self.node_locks.shards().iter().enumerate() { interval.tick().await; @@ -409,25 +516,34 @@ impl ApiLocks { "performing epoch reclamation on api lock" ); let mut lock = shard.write(); - let timer = self.reclamation_lag.start_timer(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); let count = lock .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) .count(); drop(lock); - self.unregistered.inc_by(count as u64); - timer.observe_duration() + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); } } } } pub struct WakeComputePermit { - // None if the lock is disabled - permit: Option, + permit: Token, } impl WakeComputePermit { pub fn should_check_cache(&self) -> bool { - self.permit.is_some() + !self.permit.is_disabled() + } + pub fn release(self, outcome: Outcome) { + self.permit.release(outcome) + } + pub fn release_result(self, res: Result) -> Result { + match res { + Ok(_) => self.release(Outcome::Success), + Err(_) => self.release(Outcome::Overload), + } + res } } diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index fa61ec3221..cfe491f2aa 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -1,15 +1,21 @@ //! Mock console backend which relies on a user-provided postgres instance. 
-use std::sync::Arc; - use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, - AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo, + AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; +use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; -use crate::{console::provider::CachedRoleSecret, context::RequestMonitoring}; -use async_trait::async_trait; +use crate::{auth::IpPattern, cache::Cached}; +use crate::{ + console::{ + messages::MetricsAuxInfo, + provider::{CachedAllowedIps, CachedRoleSecret}, + }, + BranchId, EndpointId, ProjectId, +}; use futures::TryFutureExt; +use std::{str::FromStr, sync::Arc}; use thiserror::Error; use tokio_postgres::{config::SslMode, Client}; use tracing::{error, info, info_span, warn, Instrument}; @@ -48,7 +54,7 @@ impl Api { async fn do_get_auth_info( &self, - creds: &ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result { let (secret, allowed_ips) = async { // Perhaps we could persist this connection, but then we'd have to @@ -61,7 +67,7 @@ impl Api { let secret = match get_execute_postgres_query( &client, "select rolpassword from pg_catalog.pg_authid where rolname = $1", - &[&&*creds.inner.user], + &[&&*user_info.user], "rolpassword", ) .await? @@ -72,21 +78,23 @@ impl Api { secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5)) } None => { - warn!("user '{}' does not exist", creds.inner.user); + warn!("user '{}' does not exist", user_info.user); None } }; let allowed_ips = match get_execute_postgres_query( &client, "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1", - &[&creds.endpoint.as_str()], + &[&user_info.endpoint.as_str()], "allowed_ips", ) .await? 
{ Some(s) => { info!("got allowed_ips: {s}"); - s.split(',').map(String::from).collect() + s.split(',') + .map(|s| IpPattern::from_str(s).unwrap()) + .collect() } None => vec![], }; @@ -99,6 +107,7 @@ impl Api { Ok(AuthInfo { secret, allowed_ips, + project_id: None, }) } @@ -111,7 +120,12 @@ impl Api { let node = NodeInfo { config, - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; @@ -140,37 +154,38 @@ async fn get_execute_postgres_query( Ok(Some(entry)) } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, _ctx: &mut RequestMonitoring, - creds: &ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result { Ok(CachedRoleSecret::new_uncached( - self.do_get_auth_info(creds).await?.secret, + self.do_get_auth_info(user_info).await?.secret, )) } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, _ctx: &mut RequestMonitoring, - creds: &ComputeUserInfo, - ) -> Result>, GetAuthInfoError> { - Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips)) + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + Ok(( + Cached::new_uncached(Arc::new( + self.do_get_auth_info(user_info).await?.allowed_ips, + )), + None, + )) } #[tracing::instrument(skip_all)] async fn wake_compute( &self, _ctx: &mut RequestMonitoring, - _extra: &ConsoleReqExtra, - _creds: &ComputeUserInfo, + _user_info: &ComputeUserInfo, ) -> Result { - self.do_wake_compute() - .map_ok(CachedNodeInfo::new_uncached) - .await + self.do_wake_compute().map_ok(Cached::new_uncached).await } } diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 7867a1e933..41bd2f4956 100644 --- 
a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -3,27 +3,30 @@ use super::{ super::messages::{ConsoleError, GetRoleSecret, WakeCompute}, errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra, + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, }; -use crate::{auth::backend::ComputeUserInfo, compute, http, scram}; use crate::{ - context::RequestMonitoring, - metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER}, + auth::backend::ComputeUserInfo, + compute, + console::messages::ColdStartInfo, + http, + metrics::{CacheOutcome, Metrics}, + rate_limiter::EndpointRateLimiter, + scram, EndpointCacheKey, }; -use async_trait::async_trait; +use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; -use itertools::Itertools; use std::sync::Arc; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{error, info, info_span, warn, Instrument}; -#[derive(Clone)] pub struct Api { endpoint: http::Endpoint, - caches: &'static ApiCaches, - locks: &'static ApiLocks, + pub caches: &'static ApiCaches, + pub locks: &'static ApiLocks, + pub wake_compute_endpoint_rate_limiter: Arc, jwt: String, } @@ -32,7 +35,8 @@ impl Api { pub fn new( endpoint: http::Endpoint, caches: &'static ApiCaches, - locks: &'static ApiLocks, + locks: &'static ApiLocks, + wake_compute_endpoint_rate_limiter: Arc, ) -> Self { let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") { Ok(v) => v, @@ -42,6 +46,7 @@ impl Api { endpoint, caches, locks, + wake_compute_endpoint_rate_limiter, jwt, } } @@ -53,9 +58,18 @@ impl Api { async fn do_get_auth_info( &self, ctx: &mut RequestMonitoring, - creds: &ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result { - let request_id = uuid::Uuid::new_v4().to_string(); + if !self + .caches + .endpoints_cache + .is_valid(ctx, 
&user_info.endpoint.normalize()) + .await + { + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } + let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { let request = self @@ -66,37 +80,47 @@ impl Api { .query(&[("session_id", ctx.session_id)]) .query(&[ ("application_name", application_name.as_str()), - ("project", creds.endpoint.as_str()), - ("role", creds.inner.user.as_str()), + ("project", user_info.endpoint.as_str()), + ("role", user_info.user.as_str()), ]) .build()?; info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; + drop(pause); info!(duration = ?start.elapsed(), "received http response"); let body = match parse_body::(response).await { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. - Err(e) => match e.http_status_code() { - Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()), - _otherwise => return Err(e.into()), - }, + // TODO(anna): retry + Err(e) => { + if e.get_reason().is_not_found() { + return Ok(AuthInfo::default()); + } else { + return Err(e.into()); + } + } }; - let secret = scram::ServerSecret::parse(&body.role_secret) - .map(AuthSecret::Scram) - .ok_or(GetAuthInfoError::BadSecret)?; - let allowed_ips = body - .allowed_ips - .into_iter() - .flatten() - .map(String::from) - .collect_vec(); - ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64); + let secret = if body.role_secret.is_empty() { + None + } else { + let secret = scram::ServerSecret::parse(&body.role_secret) + .map(AuthSecret::Scram) + .ok_or(GetAuthInfoError::BadSecret)?; + Some(secret) + }; + let allowed_ips = body.allowed_ips.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); Ok(AuthInfo { - secret: Some(secret), + secret, 
allowed_ips, + project_id: body.project_id, }) } .map_err(crate::error::log_error) @@ -107,10 +131,9 @@ impl Api { async fn do_wake_compute( &self, ctx: &mut RequestMonitoring, - extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result { - let request_id = uuid::Uuid::new_v4().to_string(); + let request_id = ctx.session_id.to_string(); let application_name = ctx.console_application_name(); async { let mut request_builder = self @@ -121,19 +144,21 @@ impl Api { .query(&[("session_id", ctx.session_id)]) .query(&[ ("application_name", application_name.as_str()), - ("project", creds.endpoint.as_str()), + ("project", user_info.endpoint.as_str()), ]); - request_builder = if extra.options.is_empty() { - request_builder - } else { - request_builder.query(&extra.options_as_deep_object()) - }; + let options = user_info.options.to_deep_object(); + if !options.is_empty() { + request_builder = request_builder.query(&options); + } + let request = request_builder.build()?; info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane); let response = self.endpoint.execute(request).await?; + drop(pause); info!(duration = ?start.elapsed(), "received http response"); let body = parse_body::(response).await?; @@ -163,88 +188,132 @@ impl Api { } } -#[async_trait] impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, ctx: &mut RequestMonitoring, - creds: &ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result { - let ep = creds.endpoint.clone(); - let user = creds.inner.user.clone(); - if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) { + let normalized_ep = &user_info.endpoint.normalize(); + let user = &user_info.user; + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { return Ok(role_secret); } - let auth_info = 
self.do_get_auth_info(ctx, creds).await?; - let (_, secret) = self - .caches - .role_secret - .insert((ep.clone(), user), auth_info.secret.clone()); - self.caches - .allowed_ips - .insert(ep, Arc::new(auth_info.allowed_ips)); - Ok(secret) + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_ips), + ); + ctx.set_project_id(project_id); + } + // When we just got a secret, we don't need to invalidate it. + Ok(Cached::new_uncached(auth_info.secret)) } - async fn get_allowed_ips( + async fn get_allowed_ips_and_secret( &self, ctx: &mut RequestMonitoring, - creds: &ComputeUserInfo, - ) -> Result>, GetAuthInfoError> { - if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) { - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["hit"]) - .inc(); - return Ok(Arc::new(allowed_ips.to_vec())); + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); + return Ok((allowed_ips, None)); } - ALLOWED_IPS_BY_CACHE_OUTCOME - .with_label_values(&["miss"]) - .inc(); - let auth_info = self.do_get_auth_info(ctx, creds).await?; + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); + let auth_info = self.do_get_auth_info(ctx, user_info).await?; let allowed_ips = Arc::new(auth_info.allowed_ips); - let ep = creds.endpoint.clone(); - let user = creds.inner.user.clone(); - self.caches - .role_secret - .insert((ep.clone(), user), auth_info.secret); - 
self.caches.allowed_ips.insert(ep, allowed_ips.clone()); - Ok(allowed_ips) + let user = &user_info.user; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + ctx.set_project_id(project_id); + } + Ok(( + Cached::new_uncached(allowed_ips), + Some(Cached::new_uncached(auth_info.secret)), + )) } #[tracing::instrument(skip_all)] async fn wake_compute( &self, ctx: &mut RequestMonitoring, - extra: &ConsoleReqExtra, - creds: &ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result { - let key: &str = &creds.inner.cache_key; + let key = user_info.endpoint_cache_key(); // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. 
- if let Some(cached) = self.caches.node_info.get(key) { - info!(key = key, "found cached compute node info"); + if let Some(cached) = self.caches.node_info.get(&key) { + info!(key = &*key, "found cached compute node info"); + ctx.set_project(cached.aux.clone()); return Ok(cached); } - let key: Arc = key.into(); - - let permit = self.locks.get_wake_compute_permit(&key).await?; + let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); + ctx.set_project(cached.aux.clone()); return Ok(cached); } } - let node = self.do_wake_compute(ctx, extra, creds).await?; - let (_, cached) = self.caches.node_info.insert(key.clone(), node); + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize_intern(), 1) + { + info!(key = &*key, "found cached compute node info"); + return Err(WakeComputeError::TooManyConnections); + } + + let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; + ctx.set_project(node.aux.clone()); + let cold_start_info = node.aux.cold_start_info; + info!("woken up a compute node"); + + // store the cached node as 'warm' + node.aux.cold_start_info = ColdStartInfo::WarmCached; + let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); + cached.aux.cold_start_info = cold_start_info; + info!(key = &*key, "created a cache entry for compute node info"); Ok(cached) @@ -261,19 +330,24 @@ async fn parse_body serde::Deserialize<'a>>( info!("request succeeded, processing the body"); return Ok(response.json().await?); } + let s = response.bytes().await?; + // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. 
+ info!("response_error plaintext: {:?}", s); // Don't throw an error here because it's not as important // as the fact that the request itself has failed. - let body = response.json().await.unwrap_or_else(|e| { + let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { warn!("failed to parse error body: {e}"); ConsoleError { error: "reason unclear (malformed error message)".into(), + http_status_code: status, + status: None, } }); + body.http_status_code = status; - let text = body.error; - error!("console responded with an error ({status}): {text}"); - Err(ApiError::Console { status, text }) + error!("console responded with an error ({status}): {body:?}"); + Err(ApiError::Console(body)) } fn parse_host_port(input: &str) -> Option<(&str, u16)> { diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 47449cf59a..ff79ba8275 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -2,18 +2,28 @@ use chrono::Utc; use once_cell::sync::OnceCell; +use pq_proto::StartupMessageParams; use smol_str::SmolStr; use std::net::IpAddr; use tokio::sync::mpsc; +use tracing::{field::display, info, info_span, Span}; use uuid::Uuid; -use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer}; +use crate::{ + console::messages::{ColdStartInfo, MetricsAuxInfo}, + error::ErrorKind, + intern::{BranchIdInt, ProjectIdInt}, + metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol}, + DbName, EndpointId, RoleName, +}; + +use self::parquet::RequestData; pub mod parquet; -static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN: OnceCell> = OnceCell::new(); +pub static LOG_CHAN_DISCONNECT: OnceCell> = OnceCell::new(); -#[derive(Clone)] /// Context data for a single request to connect to a database. /// /// This data should **not** be used for connection logic, only for observability and limiting purposes. 
@@ -21,53 +31,91 @@ static LOG_CHAN: OnceCell> = OnceCe pub struct RequestMonitoring { pub peer_addr: IpAddr, pub session_id: Uuid, - pub protocol: &'static str, + pub protocol: Protocol, first_packet: chrono::DateTime, region: &'static str, + pub span: Span, // filled in as they are discovered - project: Option, - branch: Option, - endpoint_id: Option, - user: Option, + project: Option, + branch: Option, + endpoint_id: Option, + dbname: Option, + user: Option, application: Option, error_kind: Option, + pub(crate) auth_method: Option, + success: bool, + pub(crate) cold_start_info: ColdStartInfo, + pg_options: Option, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. - sender: Option>, + sender: Option>, + // This sender is only used to log the length of session in case of success. + disconnect_sender: Option>, pub latency_timer: LatencyTimer, + // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane. 
+ rejected: Option, + disconnect_timestamp: Option>, +} + +#[derive(Clone, Debug)] +pub enum AuthMethod { + // aka link aka passwordless + Web, + ScramSha256, + ScramSha256Plus, + Cleartext, } impl RequestMonitoring { pub fn new( session_id: Uuid, peer_addr: IpAddr, - protocol: &'static str, + protocol: Protocol, region: &'static str, ) -> Self { + let span = info_span!( + "connect_request", + %protocol, + ?session_id, + %peer_addr, + ep = tracing::field::Empty, + role = tracing::field::Empty, + ); + Self { peer_addr, session_id, protocol, first_packet: Utc::now(), region, + span, project: None, branch: None, endpoint_id: None, + dbname: None, user: None, application: None, error_kind: None, + auth_method: None, + success: false, + rejected: None, + cold_start_info: ColdStartInfo::Unknown, + pg_options: None, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), + disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), + disconnect_timestamp: None, } } #[cfg(test)] pub fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test") + RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") } pub fn console_application_name(&self) -> String { @@ -78,33 +126,142 @@ impl RequestMonitoring { ) } + pub fn set_rejected(&mut self, rejected: bool) { + self.rejected = Some(rejected); + } + + pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { + self.cold_start_info = info; + self.latency_timer.cold_start_info(info); + } + + pub fn set_db_options(&mut self, options: StartupMessageParams) { + self.set_application(options.get("application_name").map(SmolStr::from)); + if let Some(user) = options.get("user") { + self.set_user(user.into()); + } + if let Some(dbname) = options.get("database") { + self.set_dbname(dbname.into()); + } + + self.pg_options = Some(options); + } + pub fn set_project(&mut self, x: MetricsAuxInfo) { + if 
self.endpoint_id.is_none() { + self.set_endpoint_id(x.endpoint_id.as_str().into()) + } self.branch = Some(x.branch_id); - self.endpoint_id = Some(x.endpoint_id); self.project = Some(x.project_id); + self.set_cold_start_info(x.cold_start_info); } - pub fn set_endpoint_id(&mut self, endpoint_id: Option) { - self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone()); + pub fn set_project_id(&mut self, project_id: ProjectIdInt) { + self.project = Some(project_id); } - pub fn set_application(&mut self, app: Option) { - self.application = app.or_else(|| self.application.clone()); + pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { + if self.endpoint_id.is_none() { + self.span.record("ep", display(&endpoint_id)); + let metric = &Metrics::get().proxy.connecting_endpoints; + let label = metric.with_labels(self.protocol); + metric.get_metric(label).measure(&endpoint_id); + self.endpoint_id = Some(endpoint_id); + } } - pub fn set_user(&mut self, user: SmolStr) { + fn set_application(&mut self, app: Option) { + if let Some(app) = app { + self.application = Some(app); + } + } + + pub fn set_dbname(&mut self, dbname: DbName) { + self.dbname = Some(dbname); + } + + pub fn set_user(&mut self, user: RoleName) { + self.span.record("role", display(&user)); self.user = Some(user); } - pub fn log(&mut self) { + pub fn set_auth_method(&mut self, auth_method: AuthMethod) { + self.auth_method = Some(auth_method); + } + + pub fn has_private_peer_addr(&self) -> bool { + match self.peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + } + } + + pub fn set_error_kind(&mut self, kind: ErrorKind) { + // Do not record errors from the private address to metrics. 
+ if !self.has_private_peer_addr() { + Metrics::get().proxy.errors_total.inc(kind); + } + if let Some(ep) = &self.endpoint_id { + let metric = &Metrics::get().proxy.endpoints_affected_by_errors; + let label = metric.with_labels(kind); + metric.get_metric(label).measure(ep); + } + self.error_kind = Some(kind); + } + + pub fn set_success(&mut self) { + self.success = true; + } + + pub fn log_connect(&mut self) { + let outcome = if self.success { + ConnectOutcome::Success + } else { + ConnectOutcome::Failed + }; + if let Some(rejected) = self.rejected { + let ep = self + .endpoint_id + .as_ref() + .map(|x| x.as_str()) + .unwrap_or_default(); + // This makes sense only if cache is disabled + info!( + ?outcome, + ?rejected, + ?ep, + "check endpoint is valid with outcome" + ); + Metrics::get() + .proxy + .invalid_endpoints_total + .inc(InvalidEndpointsGroup { + protocol: self.protocol, + rejected: rejected.into(), + outcome, + }); + } if let Some(tx) = self.sender.take() { - let _: Result<(), _> = tx.send(self.clone()); + let _: Result<(), _> = tx.send(RequestData::from(&*self)); + } + } + + fn log_disconnect(&mut self) { + // If we are here, it's guaranteed that the user successfully connected to the endpoint. + // Here we log the length of the session. 
+ self.disconnect_timestamp = Some(Utc::now()); + if let Some(tx) = self.disconnect_sender.take() { + let _: Result<(), _> = tx.send(RequestData::from(&*self)); } } } impl Drop for RequestMonitoring { fn drop(&mut self) { - self.log() + if self.sender.is_some() { + self.log_connect(); + } else { + self.log_disconnect(); + } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index ca4eff5ddf..1355b7e1d8 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,7 +1,8 @@ -use std::sync::Arc; +use std::{sync::Arc, time::SystemTime}; use anyhow::Context; -use bytes::BytesMut; +use bytes::{buf::Writer, BufMut, BytesMut}; +use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; use parquet::{ basic::Compression, @@ -12,12 +13,19 @@ use parquet::{ }, record::RecordWriter, }; -use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig}; +use pq_proto::StartupMessageParams; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; +use crate::{ + config::{remote_storage_from_toml, OptRemoteStorageConfig}, + context::LOG_CHAN_DISCONNECT, +}; + use super::{RequestMonitoring, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] @@ -28,6 +36,9 @@ pub struct ParquetUploadArgs { #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] parquet_upload_row_group_size: usize, @@ -49,21 +60,13 @@ pub struct ParquetUploadArgs { parquet_upload_compression: Compression, } -/// Hack to avoid clap being smarter. 
If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -type OptRemoteStorageConfig = Option; - -fn remote_storage_from_toml(s: &str) -> anyhow::Result { - RemoteStorageConfig::from_toml(&s.parse()?) -} - // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a upload fails, we log it at info-level, and retry. // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_UPLOAD_RETRIES times, we give up -pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; -pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; +pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // the parquet crate leaves a lot to be desired... // what follows is an attempt to write parquet files with minimal allocs. @@ -73,7 +76,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // * after each rowgroup write, we check the length of the file and upload to s3 if large enough #[derive(parquet_derive::ParquetRecordWriter)] -struct RequestData { +pub struct RequestData { region: &'static str, protocol: &'static str, /// Must be UTC. The derive macro doesn't like the timezones @@ -83,13 +86,43 @@ struct RequestData { username: Option, application_name: Option, endpoint_id: Option, + database: Option, project: Option, branch: Option, + pg_options: Option, + auth_method: Option<&'static str>, error: Option<&'static str>, + /// Success is counted if we form a HTTP response with sql rows inside + /// Or if we make it to proxy_pass + success: bool, + /// Indicates if the cplane started the new compute node for this request. 
+ cold_start_info: &'static str, + /// Tracks time from session start (HTTP request/libpq TCP handshake) + /// Through to success/failure + duration_us: u64, + /// If the session was successful after the disconnect, will be created one more event with filled `disconnect_timestamp`. + disconnect_timestamp: Option, } -impl From for RequestData { - fn from(value: RequestMonitoring) -> Self { +struct Options<'a> { + options: &'a StartupMessageParams, +} + +impl<'a> serde::Serialize for Options<'a> { + fn serialize(&self, s: S) -> Result + where + S: serde::Serializer, + { + let mut state = s.serialize_map(None)?; + for (k, v) in self.options.iter() { + state.serialize_entry(k, v)?; + } + state.end() + } +} + +impl From<&RequestMonitoring> for RequestData { + fn from(value: &RequestMonitoring) -> Self { Self { session_id: value.session_id, peer_addr: value.peer_addr.to_string(), @@ -97,11 +130,29 @@ impl From for RequestData { username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), + database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), branch: value.branch.as_deref().map(String::from), - protocol: value.protocol, + pg_options: value + .pg_options + .as_ref() + .and_then(|options| serde_json::to_string(&Options { options }).ok()), + auth_method: value.auth_method.as_ref().map(|x| match x { + super::AuthMethod::Web => "web", + super::AuthMethod::ScramSha256 => "scram_sha_256", + super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", + super::AuthMethod::Cleartext => "cleartext", + }), + protocol: value.protocol.as_str(), region: value.region, - error: value.error_kind.as_ref().map(|e| e.to_str()), + error: value.error_kind.as_ref().map(|e| e.to_metric_label()), + success: value.success, + cold_start_info: value.cold_start_info.as_str(), + duration_us: SystemTime::from(value.first_packet) + 
.elapsed() + .unwrap_or_default() + .as_micros() as u64, // 584 millenia... good enough + disconnect_timestamp: value.disconnect_timestamp.map(|x| x.naive_utc()), } } } @@ -123,8 +174,9 @@ pub async fn worker( LOG_CHAN.set(tx.downgrade()).unwrap(); // setup row stream that will close on cancellation + let cancellation_token2 = cancellation_token.clone(); tokio::spawn(async move { - cancellation_token.cancelled().await; + cancellation_token2.cancelled().await; // dropping this sender will cause the channel to close only once // all the remaining inflight requests have been completed. drop(tx); @@ -149,9 +201,38 @@ pub async fn worker( test_remote_failures: 0, }; - worker_inner(storage, rx, parquet_config).await + // TODO(anna): consider moving this to a separate function. + if let Some(disconnect_events_storage_config) = + config.parquet_upload_disconnect_events_remote_storage + { + let (tx_disconnect, mut rx_disconnect) = mpsc::unbounded_channel(); + LOG_CHAN_DISCONNECT.set(tx_disconnect.downgrade()).unwrap(); + + // setup row stream that will close on cancellation + tokio::spawn(async move { + cancellation_token.cancelled().await; + // dropping this sender will cause the channel to close only once + // all the remaining inflight requests have been completed. 
+ drop(tx_disconnect); + }); + let rx_disconnect = futures::stream::poll_fn(move |cx| rx_disconnect.poll_recv(cx)); + let rx_disconnect = rx_disconnect.map(RequestData::from); + + let storage_disconnect = + GenericRemoteStorage::from_config(&disconnect_events_storage_config) + .context("remote storage for disconnect events init")?; + let parquet_config_disconnect = parquet_config.clone(); + tokio::try_join!( + worker_inner(storage, rx, parquet_config), + worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + ) + .map(|_| ()) + } else { + worker_inner(storage, rx, parquet_config).await + } } +#[derive(Clone, Debug)] struct ParquetConfig { propeties: WriterPropertiesPtr, rows_per_group: usize, @@ -180,8 +261,9 @@ async fn worker_inner( let mut rows = Vec::with_capacity(config.rows_per_group); let schema = rows.as_slice().schema()?; - let file = BytesWriter::default(); - let mut w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?; + let buffer = BytesMut::new(); + let w = buffer.writer(); + let mut w = SerializedFileWriter::new(w, schema.clone(), config.propeties.clone())?; let mut last_upload = time::Instant::now(); @@ -209,20 +291,23 @@ async fn worker_inner( } if !w.flushed_row_groups().is_empty() { - let _: BytesWriter = upload_parquet(w, len, &storage).await?; + let _: Writer = upload_parquet(w, len, &storage).await?; } Ok(()) } -async fn flush_rows( +async fn flush_rows( rows: Vec, - mut w: SerializedFileWriter, + mut w: SerializedFileWriter, ) -> anyhow::Result<( Vec, - SerializedFileWriter, + SerializedFileWriter, RowGroupMetaDataPtr, -)> { +)> +where + W: std::io::Write + Send + 'static, +{ let span = Span::current(); let (mut rows, w, rg_meta) = tokio::task::spawn_blocking(move || { let _enter = span.enter(); @@ -246,10 +331,10 @@ async fn flush_rows( } async fn upload_parquet( - w: SerializedFileWriter, + mut w: SerializedFileWriter>, len: i64, storage: &GenericRemoteStorage, -) -> anyhow::Result { +) -> 
anyhow::Result> { let len_uncompressed = w .flushed_row_groups() .iter() @@ -258,15 +343,26 @@ async fn upload_parquet( // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry. // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253 - let (mut file, metadata) = tokio::task::spawn_blocking(move || w.finish()) + let (mut buffer, metadata) = + tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> { + let metadata = w.finish()?; + let buffer = std::mem::take(w.inner_mut().get_mut()); + Ok((buffer, metadata)) + }) .await .unwrap()?; - let data = file.buf.split().freeze(); + let data = buffer.split().freeze(); let compression = len as f64 / len_uncompressed as f64; let size = data.len(); - let id = uuid::Uuid::now_v7(); + let now = chrono::Utc::now(); + let id = uuid::Uuid::new_v7(uuid::Timestamp::from_unix( + uuid::NoContext, + // we won't be running this in 1970. this cast is ok + now.timestamp() as u64, + now.timestamp_subsec_nanos(), + )); info!( %id, @@ -274,40 +370,40 @@ async fn upload_parquet( size, compression, "uploading request parquet file" ); - let path = RemotePath::from_string(&format!("requests_{id}.parquet"))?; - backoff::retry( + let year = now.year(); + let month = now.month(); + let day = now.day(); + let hour = now.hour(); + // segment files by time for S3 performance + let path = RemotePath::from_string(&format!( + "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet" + ))?; + let cancel = CancellationToken::new(); + let maybe_err = backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); - storage.upload(stream, data.len(), &path, None).await + storage + .upload(stream, data.len(), &path, None, &cancel) + .await }, - |_e| false, + TimeoutOrCancel::caused_by_cancel, FAILED_UPLOAD_WARN_THRESHOLD, FAILED_UPLOAD_MAX_RETRIES, "request_data_upload", // we don't want cancellation to interrupt here, 
so we make a dummy cancel token - backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")), + &cancel, ) .await - .context("request_data_upload")?; + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("request_data_upload") + .err(); - Ok(file) -} - -// why doesn't BytesMut impl io::Write? -#[derive(Default)] -struct BytesWriter { - buf: BytesMut, -} - -impl std::io::Write for BytesWriter { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.buf.extend_from_slice(buf); - Ok(buf.len()) + if let Some(err) = maybe_err { + tracing::warn!(%id, %err, "failed to upload request data"); } - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } + Ok(buffer.writer()) } #[cfg(test)] @@ -332,6 +428,7 @@ mod tests { DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use tokio::{sync::mpsc, time}; + use walkdir::WalkDir; use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; @@ -388,7 +485,9 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, - }) + upload_storage_class: None, + }), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }) ); assert_eq!(parquet_upload.parquet_upload_row_group_size, 100); @@ -408,18 +507,26 @@ mod tests { RequestData { session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(), peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(), - timestamp: chrono::NaiveDateTime::from_timestamp_millis( + timestamp: chrono::DateTime::from_timestamp_millis( rng.gen_range(1703862754..1803862754), ) - .unwrap(), + .unwrap() + .naive_utc(), application_name: Some("test".to_owned()), username: Some(hex::encode(rng.gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), + database: Some(hex::encode(rng.gen::<[u8; 16]>())), project: Some(hex::encode(rng.gen::<[u8; 16]>())), branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + pg_options: None, + auth_method: None, protocol: 
["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, + success: rng.gen(), + cold_start_info: "no", + duration_us: rng.gen_range(0..30_000_000), + disconnect_timestamp: None, } } @@ -437,14 +544,17 @@ mod tests { ) -> Vec<(u64, usize, i64)> { let remote_storage_config = RemoteStorageConfig { storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()), + timeout: std::time::Duration::from_secs(120), }; let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap(); worker_inner(storage, rx, config).await.unwrap(); - let mut files = std::fs::read_dir(tmpdir.as_std_path()) - .unwrap() - .map(|entry| entry.unwrap().path()) + let mut files = WalkDir::new(tmpdir.as_std_path()) + .into_iter() + .filter_map(|entry| entry.ok()) + .filter(|entry| entry.file_type().is_file()) + .map(|entry| entry.path().to_path_buf()) .collect_vec(); files.sort(); @@ -485,16 +595,16 @@ mod tests { assert_eq!( file_stats, [ - (1029153, 3, 6000), - (1029075, 3, 6000), - (1029216, 3, 6000), - (1029129, 3, 6000), - (1029250, 3, 6000), - (1029017, 3, 6000), - (1029175, 3, 6000), - (1029247, 3, 6000), - (343124, 1, 2000) - ], + (1315874, 3, 6000), + (1315867, 3, 6000), + (1315927, 3, 6000), + (1315884, 3, 6000), + (1316014, 3, 6000), + (1315856, 3, 6000), + (1315648, 3, 6000), + (1315884, 3, 6000), + (438913, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -523,12 +633,12 @@ mod tests { assert_eq!( file_stats, [ - (1166201, 6, 12000), - (1163577, 6, 12000), - (1164641, 6, 12000), - (1168772, 6, 12000), - (196761, 1, 2000) - ], + (1223214, 5, 10000), + (1229364, 5, 10000), + (1231158, 5, 10000), + (1230520, 5, 10000), + (1221798, 5, 10000) + ] ); tmpdir.close().unwrap(); @@ -559,12 +669,12 @@ mod tests { assert_eq!( file_stats, [ - (1144934, 6, 12000), - (1144941, 6, 12000), - (1144735, 6, 12000), - (1144936, 6, 12000), - (191035, 1, 2000) - ], + (1208861, 5, 10000), + (1208592, 5, 10000), + (1208885, 5, 10000), + (1208873, 5, 10000), + (1209128, 5, 10000) + ] ); 
tmpdir.close().unwrap(); @@ -588,16 +698,16 @@ mod tests { assert_eq!( file_stats, [ - (1029153, 3, 6000), - (1029075, 3, 6000), - (1029216, 3, 6000), - (1029129, 3, 6000), - (1029250, 3, 6000), - (1029017, 3, 6000), - (1029175, 3, 6000), - (1029247, 3, 6000), - (343124, 1, 2000) - ], + (1315874, 3, 6000), + (1315867, 3, 6000), + (1315927, 3, 6000), + (1315884, 3, 6000), + (1316014, 3, 6000), + (1315856, 3, 6000), + (1315648, 3, 6000), + (1315884, 3, 6000), + (438913, 1, 2000) + ] ); tmpdir.close().unwrap(); @@ -633,7 +743,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(515807, 2, 3001), (515585, 2, 3000), (515425, 2, 2999)], + [(659836, 2, 3001), (659550, 2, 3000), (659346, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 5b2dd7ecfd..fdfe50a494 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,5 +1,7 @@ use std::{error::Error as StdError, fmt, io}; +use measured::FixedCardinalityLabel; + /// Upcast (almost) any error into an opaque [`io::Error`]. pub fn io_error(e: impl Into>) -> io::Error { io::Error::new(io::ErrorKind::Other, e) @@ -17,7 +19,7 @@ pub fn log_error(e: E) -> E { /// NOTE: This trait should not be implemented for [`anyhow::Error`], since it /// is way too convenient and tends to proliferate all across the codebase, /// ultimately leading to accidental leaks of sensitive data. -pub trait UserFacingError: fmt::Display { +pub trait UserFacingError: ReportableError { /// Format the error for client, stripping all sensitive info. /// /// Although this might be a no-op for many types, it's highly @@ -29,36 +31,63 @@ pub trait UserFacingError: fmt::Display { } } -#[derive(Clone)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)] +#[label(singleton = "type")] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... 
User, /// Network error between user and proxy. Not necessarily user error - Disconnect, + #[label(rename = "clientdisconnect")] + ClientDisconnect, - /// Proxy self-imposed rate limits + /// Proxy self-imposed user rate limits + #[label(rename = "ratelimit")] RateLimit, + /// Proxy self-imposed service-wise rate limits + #[label(rename = "serviceratelimit")] + ServiceRateLimit, + /// internal errors Service, /// Error communicating with control plane + #[label(rename = "controlplane")] ControlPlane, + /// Postgres error + Postgres, + /// Error communicating with compute Compute, } impl ErrorKind { - pub fn to_str(&self) -> &'static str { + pub fn to_metric_label(&self) -> &'static str { match self { - ErrorKind::User => "request failed due to user error", - ErrorKind::Disconnect => "client disconnected", - ErrorKind::RateLimit => "request cancelled due to rate limit", - ErrorKind::Service => "internal service error", - ErrorKind::ControlPlane => "non-retryable control plane error", - ErrorKind::Compute => "non-retryable compute error (or exhausted retry capacity)", + ErrorKind::User => "user", + ErrorKind::ClientDisconnect => "clientdisconnect", + ErrorKind::RateLimit => "ratelimit", + ErrorKind::ServiceRateLimit => "serviceratelimit", + ErrorKind::Service => "service", + ErrorKind::ControlPlane => "controlplane", + ErrorKind::Postgres => "postgres", + ErrorKind::Compute => "compute", + } + } +} + +pub trait ReportableError: fmt::Display + Send + 'static { + fn get_error_kind(&self) -> ErrorKind; +} + +impl ReportableError for tokio_postgres::error::Error { + fn get_error_kind(&self) -> ErrorKind { + if self.as_db_error().is_some() { + ErrorKind::Postgres + } else { + ErrorKind::Compute } } } diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 59e1492ed4..fc7400869f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,7 +4,7 @@ pub mod health_server; -use std::{sync::Arc, time::Duration}; +use std::{str::FromStr, sync::Arc, time::Duration}; use 
futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; @@ -13,13 +13,16 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; use tokio::time::Instant; use tracing::trace; -use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl}; +use crate::{ + metrics::{ConsoleRequest, Metrics}, + url::ApiUrl, +}; use reqwest_middleware::RequestBuilder; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). /// We deliberately don't want to replace this with a public static. -pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware { +pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() .dns_resolver(Arc::new(GaiResolver::default())) .connection_verbose(true) @@ -28,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien reqwest_middleware::ClientBuilder::new(client) .with(reqwest_tracing::TracingMiddleware::default()) - .with(rate_limiter::Limiter::new(rate_limiter_config)) .build() } @@ -90,22 +92,23 @@ impl Endpoint { /// Execute a [request](reqwest::Request). 
pub async fn execute(&self, request: Request) -> Result { - let path = request.url().path().to_string(); - let start = Instant::now(); - let res = self.client.execute(request).await; - CONSOLE_REQUEST_LATENCY - .with_label_values(&[&path]) - .observe(start.elapsed().as_secs_f64()); - res + let _timer = Metrics::get() + .proxy + .console_request_latency + .start_timer(ConsoleRequest { + request: request.url().path(), + }); + + self.client.execute(request).await } } -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use hyper::{ - client::connect::dns::{GaiResolver as HyperGaiResolver, Name}, - service::Service, +use hyper_util::client::legacy::connect::dns::{ + GaiResolver as HyperGaiResolver, Name as HyperName, }; -use reqwest::dns::{Addrs, Resolve, Resolving}; +use reqwest::dns::{Addrs, Name, Resolve, Resolving}; +/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html +use tower_service::Service; #[derive(Debug)] pub struct GaiResolver(HyperGaiResolver); @@ -118,11 +121,12 @@ impl Default for GaiResolver { impl Resolve for GaiResolver { fn resolve(&self, name: Name) -> Resolving { let this = &mut self.0.clone(); + let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); let start = Instant::now(); Box::pin( - Service::::call(this, name.clone()).map(move |result| { + Service::::call(this, hyper_name).map(move |result| { let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name, "resolve host complete"); + trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); result .map(|addrs| -> Addrs { Box::new(addrs) }) .map_err(|err| -> Box { Box::new(err) }) diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 6186ddde0d..cae9eb5b97 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,23 +1,49 @@ use anyhow::{anyhow, bail}; -use hyper::{Body, Request, Response, StatusCode}; -use 
std::{convert::Infallible, net::TcpListener}; -use tracing::info; -use utils::http::{endpoint, error::ApiError, json::json_response, RouterBuilder, RouterService}; +use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use measured::{text::BufferedTextEncoder, MetricGroup}; +use metrics::NeonMetrics; +use std::{ + convert::Infallible, + net::TcpListener, + sync::{Arc, Mutex}, +}; +use tracing::{info, info_span}; +use utils::http::{ + endpoint::{self, request_span}, + error::ApiError, + json::json_response, + RouterBuilder, RouterService, +}; + +use crate::jemalloc; async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router() -> RouterBuilder { - endpoint::make_router().get("/v1/status", status_handler) +fn make_router(metrics: AppMetrics) -> RouterBuilder { + let state = Arc::new(Mutex::new(PrometheusHandler { + encoder: BufferedTextEncoder::new(), + metrics, + })); + + endpoint::make_router() + .get("/metrics", move |r| { + let state = state.clone(); + request_span(r, move |b| prometheus_metrics_handler(b, state)) + }) + .get("/v1/status", status_handler) } -pub async fn task_main(http_listener: TcpListener) -> anyhow::Result { +pub async fn task_main( + http_listener: TcpListener, + metrics: AppMetrics, +) -> anyhow::Result { scopeguard::defer! { info!("http has shut down"); } - let service = || RouterService::new(make_router().build()?); + let service = || RouterService::new(make_router(metrics).build()?); hyper::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) 
@@ -25,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result bail!("hyper server without shutdown handling cannot shutdown successfully"); } + +struct PrometheusHandler { + encoder: BufferedTextEncoder, + metrics: AppMetrics, +} + +#[derive(MetricGroup)] +pub struct AppMetrics { + #[metric(namespace = "jemalloc")] + pub jemalloc: Option, + #[metric(flatten)] + pub neon_metrics: NeonMetrics, + #[metric(flatten)] + pub proxy: &'static crate::metrics::Metrics, +} + +async fn prometheus_metrics_handler( + _req: Request, + state: Arc>, +) -> Result, ApiError> { + let started_at = std::time::Instant::now(); + + let span = info_span!("blocking"); + let body = tokio::task::spawn_blocking(move || { + let _span = span.entered(); + + let mut state = state.lock().unwrap(); + let PrometheusHandler { encoder, metrics } = &mut *state; + + metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + + let body = encoder.finish(); + + tracing::info!( + bytes = body.len(), + elapsed_ms = started_at.elapsed().as_millis(), + "responded /metrics" + ); + + body + }) + .await + .unwrap(); + + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, "text/plain; version=0.0.4") + .body(Body::from(body)) + .unwrap(); + + Ok(response) +} diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs new file mode 100644 index 0000000000..e38135dd22 --- /dev/null +++ b/proxy/src/intern.rs @@ -0,0 +1,252 @@ +use std::{ + hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, +}; + +use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; +use rustc_hash::FxHasher; + +use crate::{BranchId, EndpointId, ProjectId, RoleName}; + +pub trait InternId: Sized + 'static { + fn get_interner() -> &'static StringInterner; +} + +pub struct StringInterner { + inner: ThreadedRodeo>, + _id: PhantomData, +} + +#[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)] +pub struct InternedString { + inner: 
Spur, + _id: PhantomData, +} + +impl std::fmt::Display for InternedString { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.as_str().fmt(f) + } +} + +impl InternedString { + pub fn as_str(&self) -> &'static str { + Id::get_interner().inner.resolve(&self.inner) + } + pub fn get(s: &str) -> Option { + Id::get_interner().get(s) + } +} + +impl AsRef for InternedString { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl std::ops::Deref for InternedString { + type Target = str; + fn deref(&self) -> &str { + self.as_str() + } +} + +impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { + fn deserialize>(d: D) -> Result { + struct Visitor(PhantomData); + impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { + type Value = InternedString; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a string") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(Id::get_interner().get_or_intern(v)) + } + } + d.deserialize_str(Visitor::(PhantomData)) + } +} + +impl serde::Serialize for InternedString { + fn serialize(&self, s: S) -> Result { + self.as_str().serialize(s) + } +} + +impl StringInterner { + pub fn new() -> Self { + StringInterner { + inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher( + Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()), + // unbounded + MemoryLimits::for_memory_usage(usize::MAX), + BuildHasherDefault::::default(), + ), + _id: PhantomData, + } + } + + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + pub fn len(&self) -> usize { + self.inner.len() + } + + pub fn current_memory_usage(&self) -> usize { + self.inner.current_memory_usage() + } + + pub fn get_or_intern(&self, s: &str) -> InternedString { + InternedString { + inner: self.inner.get_or_intern(s), + _id: PhantomData, + } + } + + pub fn get(&self, s: &str) -> Option> { + Some(InternedString { + inner: 
self.inner.get(s)?, + _id: PhantomData, + }) + } +} + +impl Index> for StringInterner { + type Output = str; + + fn index(&self, index: InternedString) -> &Self::Output { + self.inner.resolve(&index.inner) + } +} + +impl Default for StringInterner { + fn default() -> Self { + Self::new() + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct RoleNameTag; +impl InternId for RoleNameTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type RoleNameInt = InternedString; +impl From<&RoleName> for RoleNameInt { + fn from(value: &RoleName) -> Self { + RoleNameTag::get_interner().get_or_intern(value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct EndpointIdTag; +impl InternId for EndpointIdTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type EndpointIdInt = InternedString; +impl From<&EndpointId> for EndpointIdInt { + fn from(value: &EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(value) + } +} +impl From for EndpointIdInt { + fn from(value: EndpointId) -> Self { + EndpointIdTag::get_interner().get_or_intern(&value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct BranchIdTag; +impl InternId for BranchIdTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type BranchIdInt = InternedString; +impl From<&BranchId> for BranchIdInt { + fn from(value: &BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(value) + } +} +impl From for BranchIdInt { + fn from(value: BranchId) -> Self { + BranchIdTag::get_interner().get_or_intern(&value) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub struct ProjectIdTag; +impl InternId for 
ProjectIdTag { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } +} +pub type ProjectIdInt = InternedString; +impl From<&ProjectId> for ProjectIdInt { + fn from(value: &ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(value) + } +} +impl From for ProjectIdInt { + fn from(value: ProjectId) -> Self { + ProjectIdTag::get_interner().get_or_intern(&value) + } +} + +#[cfg(test)] +mod tests { + use std::sync::OnceLock; + + use crate::intern::StringInterner; + + use super::InternId; + + struct MyId; + impl InternId for MyId { + fn get_interner() -> &'static StringInterner { + pub static ROLE_NAMES: OnceLock> = OnceLock::new(); + ROLE_NAMES.get_or_init(Default::default) + } + } + + #[test] + fn push_many_strings() { + use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand_distr::Zipf; + + let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); + let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist); + + let interner = MyId::get_interner(); + + const N: usize = 100_000; + let mut verify = Vec::with_capacity(N); + for endpoint in endpoints.take(N) { + let endpoint = format!("ep-string-interning-{endpoint}"); + let key = interner.get_or_intern(&endpoint); + verify.push((endpoint, key)); + } + + for (s, key) in verify { + assert_eq!(interner[key], s); + } + + // 2031616/59861 = 34 bytes per string + assert_eq!(interner.len(), 59_861); + // will have other overhead for the internal hashmaps that are not accounted for. 
+ assert_eq!(interner.current_memory_usage(), 2_031_616); + } +} diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs new file mode 100644 index 0000000000..3243e6a140 --- /dev/null +++ b/proxy/src/jemalloc.rs @@ -0,0 +1,116 @@ +use std::marker::PhantomData; + +use measured::{ + label::NoLabels, + metric::{ + gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, + MetricEncoding, MetricFamilyEncoding, MetricType, + }, + text::TextEncoder, + LabelGroup, MetricGroup, +}; +use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; + +pub struct MetricRecorder { + epoch: epoch_mib, + inner: Metrics, +} + +#[derive(MetricGroup)] +struct Metrics { + active_bytes: JemallocGaugeFamily, + allocated_bytes: JemallocGaugeFamily, + mapped_bytes: JemallocGaugeFamily, + metadata_bytes: JemallocGaugeFamily, + resident_bytes: JemallocGaugeFamily, + retained_bytes: JemallocGaugeFamily, +} + +impl MetricGroup for MetricRecorder +where + Metrics: MetricGroup, +{ + fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> { + if self.epoch.advance().is_ok() { + self.inner.collect_group_into(enc)?; + } + Ok(()) + } +} + +impl MetricRecorder { + pub fn new() -> Result { + tracing::info!( + config = config::malloc_conf::read()?, + version = version::read()?, + "starting jemalloc recorder" + ); + + Ok(Self { + epoch: epoch::mib()?, + inner: Metrics { + active_bytes: JemallocGaugeFamily(stats::active::mib()?), + allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?), + mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?), + metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?), + resident_bytes: JemallocGaugeFamily(stats::resident::mib()?), + retained_bytes: JemallocGaugeFamily(stats::retained::mib()?), + }, + }) + } +} + +struct JemallocGauge(PhantomData); + +impl Default for JemallocGauge { + fn default() -> Self { + JemallocGauge(PhantomData) + } +} +impl MetricType for JemallocGauge { + type Metadata = T; +} + +struct 
JemallocGaugeFamily(T); +impl MetricFamilyEncoding for JemallocGaugeFamily +where + JemallocGauge: MetricEncoding, +{ + fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> { + JemallocGauge::write_type(&name, enc)?; + JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc) + } +} + +macro_rules! jemalloc_gauge { + ($stat:ident, $mib:ident) => { + impl MetricEncoding> for JemallocGauge { + fn write_type( + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + GaugeState::write_type(name, enc) + } + + fn collect_into( + &self, + mib: &stats::$mib, + labels: impl LabelGroup, + name: impl MetricNameEncoder, + enc: &mut TextEncoder, + ) -> Result<(), std::io::Error> { + if let Ok(v) = mib.read() { + enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + } + Ok(()) + } + } + }; +} + +jemalloc_gauge!(active, active_mib); +jemalloc_gauge!(allocated, allocated_mib); +jemalloc_gauge!(mapped, mapped_mib); +jemalloc_gauge!(metadata, metadata_mib); +jemalloc_gauge!(resident, resident_mib); +jemalloc_gauge!(retained, retained_mib); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 87ae8894e1..ea92eaaa55 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -3,6 +3,7 @@ use std::convert::Infallible; use anyhow::{bail, Context}; +use intern::{EndpointIdInt, EndpointIdTag, InternId}; use tokio::task::JoinError; use tokio_util::sync::CancellationToken; use tracing::warn; @@ -16,12 +17,15 @@ pub mod console; pub mod context; pub mod error; pub mod http; +pub mod intern; +pub mod jemalloc; pub mod logging; pub mod metrics; pub mod parse; pub mod protocol2; pub mod proxy; pub mod rate_limiter; +pub mod redis; pub mod sasl; pub mod scram; pub mod serverless; @@ -61,3 +65,121 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result(r: Result, JoinError>) -> anyhow::Result { r.context("join error").and_then(|x| x) } + +macro_rules! 
smol_str_wrapper { + ($name:ident) => { + #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] + pub struct $name(smol_str::SmolStr); + + impl $name { + pub fn as_str(&self) -> &str { + self.0.as_str() + } + } + + impl std::fmt::Display for $name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } + } + + impl std::cmp::PartialEq for $name + where + smol_str::SmolStr: std::cmp::PartialEq, + { + fn eq(&self, other: &T) -> bool { + self.0.eq(other) + } + } + + impl From for $name + where + smol_str::SmolStr: From, + { + fn from(x: T) -> Self { + Self(x.into()) + } + } + + impl AsRef for $name { + fn as_ref(&self) -> &str { + self.0.as_ref() + } + } + + impl std::ops::Deref for $name { + type Target = str; + fn deref(&self) -> &str { + &*self.0 + } + } + + impl<'de> serde::de::Deserialize<'de> for $name { + fn deserialize>(d: D) -> Result { + >::deserialize(d).map(Self) + } + } + + impl serde::Serialize for $name { + fn serialize(&self, s: S) -> Result { + self.0.serialize(s) + } + } + }; +} + +const POOLER_SUFFIX: &str = "-pooler"; + +impl EndpointId { + fn normalize(&self) -> Self { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + stripped.into() + } else { + self.clone() + } + } + + fn normalize_intern(&self) -> EndpointIdInt { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + EndpointIdTag::get_interner().get_or_intern(stripped) + } else { + self.into() + } + } +} + +// 90% of role name strings are 20 characters or less. +smol_str_wrapper!(RoleName); +// 50% of endpoint strings are 23 characters or less. +smol_str_wrapper!(EndpointId); +// 50% of branch strings are 23 characters or less. +smol_str_wrapper!(BranchId); +// 90% of project strings are 23 characters or less. 
+smol_str_wrapper!(ProjectId); + +// will usually equal endpoint ID +smol_str_wrapper!(EndpointCacheKey); + +smol_str_wrapper!(DbName); + +// postgres hostname, will likely be a port:ip addr +smol_str_wrapper!(Host); + +// Endpoints are a bit tricky. Rare they might be branches or projects. +impl EndpointId { + pub fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + pub fn is_project(&self) -> bool { + !self.is_endpoint() && !self.is_branch() + } + pub fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 6e4cbb3f3a..e2a75a8720 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,238 +1,612 @@ -use ::metrics::{ - exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec, - IntCounterPairVec, IntCounterVec, -}; -use prometheus::{ - register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec, - IntGaugeVec, +use std::sync::{Arc, OnceLock}; + +use lasso::ThreadedRodeo; +use measured::{ + label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet}, + metric::{histogram::Thresholds, name::MetricName}, + Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, + LabelGroup, MetricGroup, }; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; -use once_cell::sync::Lazy; -use tokio::time; +use tokio::time::{self, Instant}; -pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_db_connections_total", - "Number of opened connections to a database.", - "proxy_closed_db_connections_total", - "Number of closed connections to a database.", - &["protocol"], - ) - .unwrap() -}); +use crate::console::messages::ColdStartInfo; -pub static 
NUM_CLIENT_CONNECTION_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_opened_client_connections_total", - "Number of opened connections from a client.", - "proxy_closed_client_connections_total", - "Number of closed connections from a client.", - &["protocol"], - ) - .unwrap() -}); +#[derive(MetricGroup)] +#[metric(new(thread_pool: Arc))] +pub struct Metrics { + #[metric(namespace = "proxy")] + #[metric(init = ProxyMetrics::new(thread_pool))] + pub proxy: ProxyMetrics, -pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy = Lazy::new(|| { - register_int_counter_pair_vec!( - "proxy_accepted_connections_total", - "Number of client connections accepted.", - "proxy_closed_connections_total", - "Number of client connections closed.", - &["protocol"], - ) - .unwrap() -}); + #[metric(namespace = "wake_compute_lock")] + pub wake_compute_lock: ApiLockMetrics, +} -pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_compute_connection_latency_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // http/ws/tcp, true/false, true/false, success/failure - // 3 * 2 * 2 * 2 = 24 counters - &["protocol", "cache_miss", "pool_miss", "outcome"], - // largest bucket = 2^16 * 0.5ms = 32s - exponential_buckets(0.0005, 2.0, 16).unwrap(), - ) - .unwrap() -}); +static SELF: OnceLock = OnceLock::new(); +impl Metrics { + pub fn install(thread_pool: Arc) { + SELF.set(Metrics::new(thread_pool)) + .ok() + .expect("proxy metrics must not be installed more than once"); + } -pub static CONSOLE_REQUEST_LATENCY: Lazy = Lazy::new(|| { - register_histogram_vec!( - "proxy_console_request_latency", - "Time it took for proxy to establish a connection to the compute endpoint", - // proxy_wake_compute/proxy_get_role_info - &["request"], + pub fn get() -> &'static Self { + #[cfg(test)] + return SELF.get_or_init(|| Metrics::new(Arc::new(ThreadPoolMetrics::new(0)))); + + #[cfg(not(test))] + SELF.get() + 
.expect("proxy metrics must be installed by the main() function") + } +} + +#[derive(MetricGroup)] +#[metric(new(thread_pool: Arc))] +pub struct ProxyMetrics { + #[metric(flatten)] + pub db_connections: CounterPairVec, + #[metric(flatten)] + pub client_connections: CounterPairVec, + #[metric(flatten)] + pub connection_requests: CounterPairVec, + #[metric(flatten)] + pub http_endpoint_pools: HttpEndpointPools, + + /// Time it took for proxy to establish a connection to the compute endpoint. + // largest bucket = 2^16 * 0.5ms = 32s + #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))] + pub compute_connection_latency_seconds: HistogramVec, + + /// Time it took for proxy to receive a response from control plane. + #[metric( // largest bucket = 2^16 * 0.2ms = 13s - exponential_buckets(0.0002, 2.0, 16).unwrap(), - ) - .unwrap() -}); + metadata = Thresholds::exponential_buckets(0.0002, 2.0), + )] + pub console_request_latency: HistogramVec, -pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_allowed_ips_cache_misses", - "Number of cache hits/misses for allowed ips", - // hit/miss - &["outcome"], - ) - .unwrap() -}); + /// Time it takes to acquire a token to call console plane. + // largest bucket = 3^16 * 0.05ms = 2.15s + #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] + pub control_plane_token_acquire_seconds: Histogram<16>, -pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_control_plane_token_acquire_seconds", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(0.00005, 3.0, 16).unwrap(), - ) - .unwrap() -}); + /// Size of the HTTP request body lengths. 
+ // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))] + pub http_conn_content_length_bytes: HistogramVec, 12>, -pub static RATE_LIMITER_LIMIT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "semaphore_control_plane_limit", - "Current limit of the semaphore control plane", - &["limit"], // 2 counters - ) - .unwrap() -}); + /// Time it takes to reclaim unused connection pools. + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub http_pool_reclaimation_lag_seconds: Histogram<16>, -pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_accepted_connections_by_sni", - "Number of connections (per sni).", - &["kind"], - ) - .unwrap() -}); + /// Number of opened connections to a database. + pub http_pool_opened_connections: Gauge, -pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_allowed_ips_number", - "Number of allowed ips", - vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0], - ) - .unwrap() -}); + /// Number of cache hits/misses for allowed ips. + pub allowed_ips_cache_misses: CounterVec>, + + /// Number of allowed ips + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] + pub allowed_ips_number: Histogram<10>, + + /// Number of connections (per sni). + pub accepted_connections_by_sni: CounterVec>, + + /// Number of connection failures (per kind). + pub connection_failures_total: CounterVec>, + + /// Number of wake-up failures (per kind). + pub connection_failures_breakdown: CounterVec, + + /// Number of bytes sent/received between all clients and backends. + pub io_bytes: CounterVec>, + + /// Number of errors by a given classification. + pub errors_total: CounterVec>, + + /// Number of cancellation requests (per found/not_found). 
+ pub cancellation_requests_total: CounterVec, + + /// Number of errors by a given classification + pub redis_errors_total: CounterVec, + + /// Number of TLS handshake failures + pub tls_handshake_failures: Counter, + + /// Number of connection requests affected by authentication rate limits + pub requests_auth_rate_limits_total: Counter, + + /// HLL approximate cardinality of endpoints that are connecting + pub connecting_endpoints: HyperLogLogVec, 32>, + + /// Number of endpoints affected by errors of a given classification + pub endpoints_affected_by_errors: HyperLogLogVec, 32>, + + /// Number of endpoints affected by authentication rate limits + pub endpoints_auth_rate_limits: HyperLogLog<32>, + + /// Number of invalid endpoints (per protocol, per rejected). + pub invalid_endpoints_total: CounterVec, + + /// Number of retries (per outcome, per retry_type). + #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] + pub retries_metric: HistogramVec, + + /// Number of events consumed from redis (per event type). 
+ pub redis_events_count: CounterVec>, + + #[metric(namespace = "connect_compute_lock")] + pub connect_compute_lock: ApiLockMetrics, + + #[metric(namespace = "scram_pool")] + #[metric(init = thread_pool)] + pub scram_pool: Arc, +} + +#[derive(MetricGroup)] +#[metric(new())] +pub struct ApiLockMetrics { + /// Number of semaphores registered in this api lock + pub semaphores_registered: Counter, + /// Number of semaphores unregistered in this api lock + pub semaphores_unregistered: Counter, + /// Time it takes to reclaim unused semaphores in the api lock + #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] + pub reclamation_lag_seconds: Histogram<16>, + /// Time it takes to acquire a semaphore lock + #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] + pub semaphore_acquire_seconds: Histogram<16>, +} + +impl Default for ApiLockMetrics { + fn default() -> Self { + Self::new() + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum HttpDirection { + Request, + Response, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "direction")] +pub enum Direction { + Tx, + Rx, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "protocol")] +pub enum Protocol { + Http, + Ws, + Tcp, + SniRouter, +} + +impl Protocol { + pub fn as_str(&self) -> &'static str { + match self { + Protocol::Http => "http", + Protocol::Ws => "ws", + Protocol::Tcp => "tcp", + Protocol::SniRouter => "sni_router", + } + } +} + +impl std::fmt::Display for Protocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum Bool { + True, + False, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum Outcome { + Success, + Failed, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "outcome")] +pub enum CacheOutcome { + 
Hit, + Miss, +} + +#[derive(LabelGroup)] +#[label(set = ConsoleRequestSet)] +pub struct ConsoleRequest<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub request: &'a str, +} + +#[derive(MetricGroup, Default)] +pub struct HttpEndpointPools { + /// Number of endpoints we have registered pools for + pub http_pool_endpoints_registered_total: Counter, + /// Number of endpoints we have unregistered pools for + pub http_pool_endpoints_unregistered_total: Counter, +} + +pub struct HttpEndpointPoolsGuard<'a> { + dec: &'a Counter, +} + +impl Drop for HttpEndpointPoolsGuard<'_> { + fn drop(&mut self) { + self.dec.inc(); + } +} + +impl HttpEndpointPools { + pub fn guard(&self) -> HttpEndpointPoolsGuard { + self.http_pool_endpoints_registered_total.inc(); + HttpEndpointPoolsGuard { + dec: &self.http_pool_endpoints_unregistered_total, + } + } +} +pub struct NumDbConnectionsGauge; +impl CounterPairAssoc for NumDbConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total"); + const INC_HELP: &'static str = "Number of opened connections to a database."; + const DEC_HELP: &'static str = "Number of closed connections to a database."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; + +pub struct NumClientConnectionsGauge; +impl CounterPairAssoc for NumClientConnectionsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total"); + const INC_HELP: &'static str = "Number of opened connections from a client."; + const DEC_HELP: &'static str = "Number of closed connections from a client."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumClientConnectionsGuard<'a> = + 
metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; + +pub struct NumConnectionRequestsGauge; +impl CounterPairAssoc for NumConnectionRequestsGauge { + const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total"); + const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total"); + const INC_HELP: &'static str = "Number of client connections accepted."; + const DEC_HELP: &'static str = "Number of client connections closed."; + type LabelGroupSet = StaticLabelSet; +} +pub type NumConnectionRequestsGuard<'a> = + metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; + +#[derive(LabelGroup)] +#[label(set = ComputeConnectionLatencySet)] +pub struct ComputeConnectionLatencyGroup { + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, + excluded: LatencyExclusions, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum LatencyExclusions { + Client, + ClientAndCplane, + ClientCplaneCompute, + ClientCplaneComputeRetry, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum SniKind { + Sni, + NoSni, + PasswordHack, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum ConnectionFailureKind { + ComputeCached, + ComputeUncached, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +#[label(singleton = "kind")] +pub enum WakeupFailureKind { + BadComputeAddress, + ApiTransportError, + QuotaExceeded, + ApiConsoleLocked, + ApiConsoleBadRequest, + ApiConsoleOtherServerError, + ApiConsoleOtherError, + TimeoutError, +} + +#[derive(LabelGroup)] +#[label(set = ConnectionFailuresBreakdownSet)] +pub struct ConnectionFailuresBreakdownGroup { + pub kind: WakeupFailureKind, + pub retry: Bool, +} + +#[derive(LabelGroup, Copy, Clone)] +#[label(set = RedisErrorsSet)] +pub struct RedisErrors<'a> { + #[label(dynamic_with = ThreadedRodeo, default)] + pub channel: &'a str, +} + +#[derive(FixedCardinalityLabel, 
Copy, Clone)] +pub enum CancellationSource { + FromClient, + FromRedis, + Local, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub enum CancellationOutcome { + NotFound, + Found, +} + +#[derive(LabelGroup)] +#[label(set = CancellationRequestSet)] +pub struct CancellationRequest { + pub source: CancellationSource, + pub kind: CancellationOutcome, +} + +pub enum Waiting { + Cplane, + Client, + Compute, + RetryTimeout, +} + +#[derive(Default)] +struct Accumulated { + cplane: time::Duration, + client: time::Duration, + compute: time::Duration, + retry: time::Duration, +} -#[derive(Clone)] pub struct LatencyTimer { // time since the stopwatch was started - start: Option, + start: time::Instant, + // time since the stopwatch was stopped + stop: Option, // accumulated time on the stopwatch - pub accumulated: std::time::Duration, + accumulated: Accumulated, // label data - protocol: &'static str, - cache_miss: bool, - pool_miss: bool, - outcome: &'static str, + protocol: Protocol, + cold_start_info: ColdStartInfo, + outcome: ConnectOutcome, } pub struct LatencyTimerPause<'a> { timer: &'a mut LatencyTimer, + start: time::Instant, + waiting_for: Waiting, } impl LatencyTimer { - pub fn new(protocol: &'static str) -> Self { + pub fn new(protocol: Protocol) -> Self { Self { - start: Some(time::Instant::now()), - accumulated: std::time::Duration::ZERO, + start: time::Instant::now(), + stop: None, + accumulated: Accumulated::default(), protocol, - cache_miss: false, - // by default we don't do pooling - pool_miss: true, + cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: "failed", + outcome: ConnectOutcome::Failed, } } - pub fn pause(&mut self) -> LatencyTimerPause<'_> { - // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); - LatencyTimerPause { timer: self } + pub fn pause(&mut self, waiting_for: 
Waiting) -> LatencyTimerPause<'_> { + LatencyTimerPause { + timer: self, + start: Instant::now(), + waiting_for, + } } - pub fn cache_miss(&mut self) { - self.cache_miss = true; - } - - pub fn pool_hit(&mut self) { - self.pool_miss = false; + pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) { + self.cold_start_info = cold_start_info; } pub fn success(&mut self) { // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); + self.stop = Some(time::Instant::now()); // success - self.outcome = "success"; + self.outcome = ConnectOutcome::Success; } } impl Drop for LatencyTimerPause<'_> { fn drop(&mut self) { - // start the stopwatch again - self.timer.start = Some(time::Instant::now()); + let dur = self.start.elapsed(); + match self.waiting_for { + Waiting::Cplane => self.timer.accumulated.cplane += dur, + Waiting::Client => self.timer.accumulated.client += dur, + Waiting::Compute => self.timer.accumulated.compute += dur, + Waiting::RetryTimeout => self.timer.accumulated.retry += dur, + } } } +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum ConnectOutcome { + Success, + Failed, +} + impl Drop for LatencyTimer { fn drop(&mut self) { - let duration = - self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated; - COMPUTE_CONNECTION_LATENCY - .with_label_values(&[ - self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, - ]) - .observe(duration.as_secs_f64()) + let duration = self + .stop + .unwrap_or_else(time::Instant::now) + .duration_since(self.start); + + let metric = &Metrics::get().proxy.compute_connection_latency_seconds; + + // Excluding client communication from the accumulated time. 
+ metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::Client, + }, + duration + .saturating_sub(self.accumulated.client) + .as_secs_f64(), + ); + + // Exclude client and cplane communication from the accumulated time. + let accumulated_total = self.accumulated.client + self.accumulated.cplane; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientAndCplane, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue communication from the accumulated time. + let accumulated_total = + self.accumulated.client + self.accumulated.cplane + self.accumulated.compute; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneCompute, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); + + // Exclude client cplane, compue, retry communication from the accumulated time. 
+ let accumulated_total = self.accumulated.client + + self.accumulated.cplane + + self.accumulated.compute + + self.accumulated.retry; + metric.observe( + ComputeConnectionLatencyGroup { + protocol: self.protocol, + cold_start_info: self.cold_start_info, + outcome: self.outcome, + excluded: LatencyExclusions::ClientCplaneComputeRetry, + }, + duration.saturating_sub(accumulated_total).as_secs_f64(), + ); } } -pub static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_total", - "Number of connection failures (per kind).", - &["kind"], - ) - .unwrap() -}); - -pub static NUM_WAKEUP_FAILURES: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_connection_failures_breakdown", - "Number of wake-up failures (per kind).", - &["retry", "kind"], - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes_per_client", - "Number of bytes sent/received between client and backend.", - crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS, - ) - .unwrap() -}); - -pub static NUM_BYTES_PROXIED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "proxy_io_bytes", - "Number of bytes sent/received between all clients and backends.", - &["direction"], - ) - .unwrap() -}); - -pub const fn bool_to_str(x: bool) -> &'static str { - if x { - "true" - } else { - "false" +impl From for Bool { + fn from(value: bool) -> Self { + if value { + Bool::True + } else { + Bool::False + } } } + +#[derive(LabelGroup)] +#[label(set = InvalidEndpointsSet)] +pub struct InvalidEndpointsGroup { + pub protocol: Protocol, + pub rejected: Bool, + pub outcome: ConnectOutcome, +} + +#[derive(LabelGroup)] +#[label(set = RetriesMetricSet)] +pub struct RetriesMetricGroup { + pub outcome: ConnectOutcome, + pub retry_type: RetryType, +} + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +pub enum RetryType { + WakeCompute, + ConnectToCompute, +} + 
+#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "event")] +pub enum RedisEventsCount { + EndpointCreated, + BranchCreated, + ProjectCreated, + CancelSession, + PasswordUpdate, + AllowedIpsUpdate, +} + +pub struct ThreadPoolWorkers(usize); +pub struct ThreadPoolWorkerId(pub usize); + +impl LabelValue for ThreadPoolWorkerId { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0 as i64) + } +} + +impl LabelGroup for ThreadPoolWorkerId { + fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) { + v.write_value(LabelName::from_str("worker"), self); + } +} + +impl LabelSet for ThreadPoolWorkers { + type Value<'a> = ThreadPoolWorkerId; + + fn dynamic_cardinality(&self) -> Option { + Some(self.0) + } + + fn encode(&self, value: Self::Value<'_>) -> Option { + (value.0 < self.0).then_some(value.0) + } + + fn decode(&self, value: usize) -> Self::Value<'_> { + ThreadPoolWorkerId(value) + } +} + +impl FixedCardinalitySet for ThreadPoolWorkers { + fn cardinality(&self) -> usize { + self.0 + } +} + +#[derive(MetricGroup)] +#[metric(new(workers: usize))] +pub struct ThreadPoolMetrics { + pub injector_queue_depth: Gauge, + #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_queue_depth: GaugeVec, + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_turns_total: CounterVec, + #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] + pub worker_task_skips_total: CounterVec, +} diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 1d8931be85..1dd4563514 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,43 +1,26 @@ //! 
Proxy Protocol V2 implementation use std::{ - future::poll_fn, - future::Future, io, net::SocketAddr, - pin::{pin, Pin}, - task::{ready, Context, Poll}, + pin::Pin, + task::{Context, Poll}, }; -use bytes::{Buf, BytesMut}; -use hyper::server::conn::{AddrIncoming, AddrStream}; +use bytes::BytesMut; use pin_project_lite::pin_project; -use tls_listener::AsyncAccept; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; -pub struct ProxyProtocolAccept { - pub incoming: AddrIncoming, -} - pin_project! { - pub struct WithClientIp { + /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough + pub struct ChainRW { #[pin] pub inner: T, buf: BytesMut, - tlv_bytes: u16, - state: ProxyParse, } } -#[derive(Clone, PartialEq, Debug)] -enum ProxyParse { - NotStarted, - - Finished(SocketAddr), - None, -} - -impl AsyncWrite for WithClientIp { +impl AsyncWrite for ChainRW { #[inline] fn poll_write( self: Pin<&mut Self>, @@ -72,285 +55,174 @@ impl AsyncWrite for WithClientIp { } } -impl WithClientIp { - pub fn new(inner: T) -> Self { - WithClientIp { - inner, - buf: BytesMut::with_capacity(128), - tlv_bytes: 0, - state: ProxyParse::NotStarted, - } - } - - pub fn client_addr(&self) -> Option { - match self.state { - ProxyParse::Finished(socket) => Some(socket), - _ => None, - } - } -} - -impl WithClientIp { - pub async fn wait_for_addr(&mut self) -> io::Result> { - match self.state { - ProxyParse::NotStarted => { - let mut pin = Pin::new(&mut *self); - let addr = poll_fn(|cx| pin.as_mut().poll_client_ip(cx)).await?; - match addr { - Some(addr) => self.state = ProxyParse::Finished(addr), - None => self.state = ProxyParse::None, - } - Ok(addr) - } - ProxyParse::Finished(addr) => Ok(Some(addr)), - ProxyParse::None => Ok(None), - } - } -} - /// Proxy Protocol Version 2 Header const HEADER: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; -impl WithClientIp { - /// implementation of - /// Version 2 (Binary Format) - fn poll_client_ip( - mut 
self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - // The binary header format starts with a constant 12 bytes block containing the protocol signature : - // \x0D \x0A \x0D \x0A \x00 \x0D \x0A \x51 \x55 \x49 \x54 \x0A - while self.buf.len() < 16 { - let mut this = self.as_mut().project(); - let bytes_read = pin!(this.inner.read_buf(this.buf)).poll(cx)?; +pub async fn read_proxy_protocol( + mut read: T, +) -> std::io::Result<(ChainRW, Option)> { + let mut buf = BytesMut::with_capacity(128); + while buf.len() < 16 { + let bytes_read = read.read_buf(&mut buf).await?; - // exit for bad header - let len = usize::min(self.buf.len(), HEADER.len()); - if self.buf[..len] != HEADER[..len] { - return Poll::Ready(Ok(None)); - } - - // if no more bytes available then exit - if ready!(bytes_read) == 0 { - return Poll::Ready(Ok(None)); - }; + // exit for bad header + let len = usize::min(buf.len(), HEADER.len()); + if buf[..len] != HEADER[..len] { + return Ok((ChainRW { inner: read, buf }, None)); } - // The next byte (the 13th one) is the protocol version and command. - // The highest four bits contains the version. As of this specification, it must - // always be sent as \x2 and the receiver must only accept this value. - let vc = self.buf[12]; - let version = vc >> 4; - let command = vc & 0b1111; - if version != 2 { - return Poll::Ready(Err(io::Error::new( + // if no more bytes available then exit + if bytes_read == 0 { + return Ok((ChainRW { inner: read, buf }, None)); + }; + } + + let header = buf.split_to(16); + + // The next byte (the 13th one) is the protocol version and command. + // The highest four bits contains the version. As of this specification, it must + // always be sent as \x2 and the receiver must only accept this value. + let vc = header[12]; + let version = vc >> 4; + let command = vc & 0b1111; + if version != 2 { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol version. 
expected version 2", + )); + } + match command { + // the connection was established on purpose by the proxy + // without being relayed. The connection endpoints are the sender and the + // receiver. Such connections exist when the proxy sends health-checks to the + // server. The receiver must accept this connection as valid and must use the + // real connection endpoints and discard the protocol block including the + // family which is ignored. + 0 => {} + // the connection was established on behalf of another node, + // and reflects the original connection endpoints. The receiver must then use + // the information provided in the protocol block to get original the address. + 1 => {} + // other values are unassigned and must not be emitted by senders. Receivers + // must drop connections presenting unexpected values here. + _ => { + return Err(io::Error::new( io::ErrorKind::Other, - "invalid proxy protocol version. expected version 2", - ))); + "invalid proxy protocol command. expected local (0) or proxy (1)", + )) } - match command { - // the connection was established on purpose by the proxy - // without being relayed. The connection endpoints are the sender and the - // receiver. Such connections exist when the proxy sends health-checks to the - // server. The receiver must accept this connection as valid and must use the - // real connection endpoints and discard the protocol block including the - // family which is ignored. - 0 => {} - // the connection was established on behalf of another node, - // and reflects the original connection endpoints. The receiver must then use - // the information provided in the protocol block to get original the address. - 1 => {} - // other values are unassigned and must not be emitted by senders. Receivers - // must drop connections presenting unexpected values here. - _ => { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol command. 
expected local (0) or proxy (1)", - ))) - } - }; + }; - // The 14th byte contains the transport protocol and address family. The highest 4 - // bits contain the address family, the lowest 4 bits contain the protocol. - let ft = self.buf[13]; - let address_length = match ft { - // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - 0x11 | 0x12 => 12, - // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - 0x21 | 0x22 => 36, - // unspecified or unix stream. ignore the addresses - _ => 0, - }; + // The 14th byte contains the transport protocol and address family. The highest 4 + // bits contain the address family, the lowest 4 bits contain the protocol. + let ft = header[13]; + let address_length = match ft { + // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET + // protocol family. Address length is 2*4 + 2*2 = 12 bytes. + 0x11 | 0x12 => 12, + // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 + // protocol family. Address length is 2*16 + 2*2 = 36 bytes. + 0x21 | 0x22 => 36, + // unspecified or unix stream. ignore the addresses + _ => 0, + }; - // The 15th and 16th bytes is the address length in bytes in network endian order. 
- // It is used so that the receiver knows how many address bytes to skip even when - // it does not implement the presented protocol. Thus the length of the protocol - // header in bytes is always exactly 16 + this value. When a sender presents a - // LOCAL connection, it should not present any address so it sets this field to - // zero. Receivers MUST always consider this field to skip the appropriate number - // of bytes and must not assume zero is presented for LOCAL connections. When a - // receiver accepts an incoming connection showing an UNSPEC address family or - // protocol, it may or may not decide to log the address information if present. - let remaining_length = u16::from_be_bytes(self.buf[14..16].try_into().unwrap()); - if remaining_length < address_length { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol length. not enough to fit requested IP addresses", - ))); + // The 15th and 16th bytes is the address length in bytes in network endian order. + // It is used so that the receiver knows how many address bytes to skip even when + // it does not implement the presented protocol. Thus the length of the protocol + // header in bytes is always exactly 16 + this value. When a sender presents a + // LOCAL connection, it should not present any address so it sets this field to + // zero. Receivers MUST always consider this field to skip the appropriate number + // of bytes and must not assume zero is presented for LOCAL connections. When a + // receiver accepts an incoming connection showing an UNSPEC address family or + // protocol, it may or may not decide to log the address information if present. + let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap()); + if remaining_length < address_length { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol length. 
not enough to fit requested IP addresses", + )); + } + drop(header); + + while buf.len() < remaining_length as usize { + if read.read_buf(&mut buf).await? == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "stream closed while waiting for proxy protocol addresses", + )); } - - while self.buf.len() < 16 + address_length as usize { - let mut this = self.as_mut().project(); - if ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?) == 0 { - return Poll::Ready(Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "stream closed while waiting for proxy protocol addresses", - ))); - } - } - - let this = self.as_mut().project(); - - // we are sure this is a proxy protocol v2 entry and we have read all the bytes we need - // discard the header we have parsed - this.buf.advance(16); - - // Starting from the 17th byte, addresses are presented in network byte order. - // The address order is always the same : - // - source layer 3 address in network byte order - // - destination layer 3 address in network byte order - // - source layer 4 address if any, in network byte order (port) - // - destination layer 4 address if any, in network byte order (port) - let addresses = this.buf.split_to(address_length as usize); - let socket = match address_length { - 12 => { - let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - 36 => { - let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); - Some(SocketAddr::from((src_addr, src_port))) - } - _ => None, - }; - - *this.tlv_bytes = remaining_length - address_length; - self.as_mut().skip_tlv_inner(); - - Poll::Ready(Ok(socket)) } - #[cold] - fn read_ip(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let ip = ready!(self.as_mut().poll_client_ip(cx)?); - match ip { - Some(x) => 
*self.as_mut().project().state = ProxyParse::Finished(x), - None => *self.as_mut().project().state = ProxyParse::None, + // Starting from the 17th byte, addresses are presented in network byte order. + // The address order is always the same : + // - source layer 3 address in network byte order + // - destination layer 3 address in network byte order + // - source layer 4 address if any, in network byte order (port) + // - destination layer 4 address if any, in network byte order (port) + let addresses = buf.split_to(remaining_length as usize); + let socket = match address_length { + 12 => { + let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) } - Poll::Ready(Ok(())) - } + 36 => { + let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); + let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + Some(SocketAddr::from((src_addr, src_port))) + } + _ => None, + }; - #[cold] - fn skip_tlv(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let mut this = self.as_mut().project(); - // we know that this.buf is empty - debug_assert_eq!(this.buf.len(), 0); - - this.buf.reserve((*this.tlv_bytes).clamp(0, 1024) as usize); - ready!(pin!(this.inner.read_buf(this.buf)).poll(cx)?); - self.skip_tlv_inner(); - - Poll::Ready(Ok(())) - } - - fn skip_tlv_inner(self: Pin<&mut Self>) { - let tlv_bytes_read = match u16::try_from(self.buf.len()) { - // we read more than u16::MAX therefore we must have read the full tlv_bytes - Err(_) => self.tlv_bytes, - // we might not have read the full tlv bytes yet - Ok(n) => u16::min(n, self.tlv_bytes), - }; - let this = self.project(); - *this.tlv_bytes -= tlv_bytes_read; - this.buf.advance(tlv_bytes_read as usize); - } + Ok((ChainRW { inner: read, buf }, socket)) } -impl AsyncRead for WithClientIp { +impl AsyncRead for ChainRW { #[inline] fn poll_read( - mut self: Pin<&mut Self>, + 
self: Pin<&mut Self>, cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - // I'm assuming these 3 comparisons will be easy to branch predict. - // especially with the cold attributes - // which should make this read wrapper almost invisible - - if let ProxyParse::NotStarted = self.state { - ready!(self.as_mut().read_ip(cx)?); - } - - while self.tlv_bytes > 0 { - ready!(self.as_mut().skip_tlv(cx)?) - } - - let this = self.project(); - if this.buf.is_empty() { - this.inner.poll_read(cx, buf) + if self.buf.is_empty() { + self.project().inner.poll_read(cx, buf) } else { - // we know that tlv_bytes is 0 - debug_assert_eq!(*this.tlv_bytes, 0); - - let write = usize::min(this.buf.len(), buf.remaining()); - let slice = this.buf.split_to(write).freeze(); - buf.put_slice(&slice); - - // reset the allocation so it can be freed - if this.buf.is_empty() { - *this.buf = BytesMut::new(); - } - - Poll::Ready(Ok(())) + self.read_from_buf(buf) } } } -impl AsyncAccept for ProxyProtocolAccept { - type Connection = WithClientIp; +impl ChainRW { + #[cold] + fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll> { + debug_assert!(!self.buf.is_empty()); + let this = self.project(); - type Error = io::Error; + let write = usize::min(this.buf.len(), buf.remaining()); + let slice = this.buf.split_to(write).freeze(); + buf.put_slice(&slice); - fn poll_accept( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll>> { - let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?); - let Some(conn) = conn else { - return Poll::Ready(None); - }; + // reset the allocation so it can be freed + if this.buf.is_empty() { + *this.buf = BytesMut::new(); + } - Poll::Ready(Some(Ok(WithClientIp::new(conn)))) + Poll::Ready(Ok(())) } } #[cfg(test)] mod tests { - use std::pin::pin; - use tokio::io::AsyncReadExt; - use crate::protocol2::{ProxyParse, WithClientIp}; + use crate::protocol2::read_proxy_protocol; #[tokio::test] async fn test_ipv4() { @@ -372,16 +244,15 @@ mod 
tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([127, 0, 0, 1], 65535).into()) - ); + assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into())); } #[tokio::test] @@ -404,17 +275,17 @@ mod tests { let extra_data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); assert_eq!( - read.state, - ProxyParse::Finished( - ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() - ) + addr, + Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()) ); } @@ -422,24 +293,24 @@ mod tests { async fn test_invalid() { let data = [0x55; 256]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] async fn test_short() { let data = [0x55; 10]; - let mut read = pin!(WithClientIp::new(data.as_slice())); + let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(read.state, ProxyParse::None); + assert_eq!(addr, None); } #[tokio::test] @@ -465,15 +336,14 @@ mod tests { let extra_data = [0xaa; 256]; - let mut read = pin!(WithClientIp::new(header.chain(extra_data.as_slice()))); + let (mut read, addr) = 
read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!( - read.state, - ProxyParse::Finished(([55, 56, 57, 58], 65535).into()) - ); + assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into())); } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 4aba222082..072f51958f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -2,40 +2,45 @@ mod tests; pub mod connect_compute; +mod copy_bidirectional; +pub mod handshake; +pub mod passthrough; pub mod retry; +pub mod wake_compute; +pub use copy_bidirectional::copy_bidirectional_client_compute; use crate::{ auth, - cancellation::{self, CancelMap}, + cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, compute, - config::{AuthenticationConfig, ProxyConfig, TlsConfig}, - console::{self, messages::MetricsAuxInfo}, + config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, - metrics::{ - NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER, - NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE, - }, - protocol2::WithClientIp, + error::ReportableError, + metrics::{Metrics, NumClientConnectionsGuard}, + protocol2::read_proxy_protocol, + proxy::handshake::{handshake, HandshakeData}, rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, - usage_metrics::{Ids, USAGE_METRICS}, + EndpointCacheKey, }; -use anyhow::{bail, Context}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; -use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; +use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; +use smol_str::{format_smolstr, SmolStr}; use std::sync::Arc; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, Instrument}; -use 
utils::measured_stream::MeasuredStream; +use tracing::{error, info, Instrument}; -use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::{ + connect_compute::{connect_to_compute, TcpMechanism}, + passthrough::ProxyPassthrough, +}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; -const ERR_PROTO_VIOLATION: &str = "protocol violation"; pub async fn run_until_cancelled( f: F, @@ -56,6 +61,7 @@ pub async fn task_main( config: &'static ProxyConfig, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { @@ -67,57 +73,88 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); - let cancel_map = Arc::new(CancelMap::default()); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await { let (socket, peer_addr) = accept_result?; + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); + let session_id = uuid::Uuid::new_v4(); - let cancel_map = Arc::clone(&cancel_map); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellation_handler = Arc::clone(&cancellation_handler); - connections.spawn( - async move { - info!("accepted postgres client connection"); + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); - let mut socket = WithClientIp::new(socket); - let mut peer_addr = peer_addr.ip(); - if let Some(addr) = socket.wait_for_addr().await? 
{ - peer_addr = addr.ip(); - tracing::Span::current().record("peer_addr", &tracing::field::display(addr)); - } else if config.require_client_ip { - bail!("missing required client IP"); + connections.spawn(async move { + let (socket, peer_addr) = match read_proxy_protocol(socket).await { + Ok((socket, Some(addr))) => (socket, addr.ip()), + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; } + Ok((_socket, None)) if config.require_client_ip => { + error!("missing required client IP"); + return; + } + Ok((socket, None)) => (socket, peer_addr.ip()), + }; - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region); + match socket.inner.set_nodelay(true) { + Ok(()) => {} + Err(e) => { + error!("per-client task finished with an error: failed to set socket option: {e:#}"); + return; + } + }; - socket - .inner - .set_nodelay(true) - .context("failed to set socket option")?; + let mut ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); + let span = ctx.span.clone(); + let startup = Box::pin( handle_client( config, &mut ctx, - &cancel_map, + cancellation_handler, socket, ClientMode::Tcp, - endpoint_rate_limiter, + endpoint_rate_limiter2, + conn_gauge, ) - .await + .instrument(span.clone()), + ); + let res = startup.await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log_connect(); + match p.proxy_pass().instrument(span.clone()).await { + Ok(()) => {} + Err(e) => { + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + } + } } - .instrument(info_span!( - "handle_client", - ?session_id, - peer_addr = tracing::field::Empty - )) - .unwrap_or_else(move |e| { - // Acknowledge that the task has finished with 
an error. - error!(?session_id, "per-client task finished with an error: {e:#}"); - }), - ); + }); } connections.close(); @@ -136,14 +173,14 @@ pub enum ClientMode { /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { - fn allow_cleartext(&self) -> bool { + pub fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, ClientMode::Websockets { .. } => true, } } - fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { + pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { match self { ClientMode::Tcp => config.allow_self_signed_compute, ClientMode::Websockets { .. } => false, @@ -166,162 +203,152 @@ impl ClientMode { } } -pub async fn handle_client( - config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, - cancel_map: &CancelMap, - stream: S, - mode: ClientMode, - endpoint_rate_limiter: Arc, -) -> anyhow::Result<()> { - info!( - protocol = ctx.protocol, - "handling interactive connection from client" - ); - - let proto = ctx.protocol; - let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE - .with_label_values(&[proto]) - .guard(); - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&[proto]) - .guard(); - - let tls = config.tls_config.as_ref(); - - let pause = ctx.latency_timer.pause(); - let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map); - let (mut stream, params) = match do_handshake.await? { - Some(x) => x, - None => return Ok(()), // it's a cancellation request - }; - drop(pause); - - // Extract credentials which we're going to use for auth. 
- let creds = { - let hostname = mode.hostname(stream.get_ref()); - - let common_names = tls.and_then(|tls| tls.common_names.clone()); - let result = config - .auth_backend - .as_ref() - .map(|_| auth::ClientCredentials::parse(ctx, &params, hostname, common_names)) - .transpose(); - - match result { - Ok(creds) => creds, - Err(e) => stream.throw_error(e).await?, - } - }; - - ctx.set_endpoint_id(creds.get_endpoint()); - - let client = Client::new( - stream, - creds, - &params, - mode.allow_self_signed_compute(config), - endpoint_rate_limiter, - ); - cancel_map - .with_session(|session| { - client.connect_to_db(ctx, session, mode, &config.authentication_config) - }) - .await +#[derive(Debug, Error)] +// almost all errors should be reported to the user, but there's a few cases where we cannot +// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons +// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation, +// we cannot be sure the client even understands our error message +// 3. PrepareClient: The client disconnected, so we can't tell them anyway... +pub enum ClientRequestError { + #[error("{0}")] + Cancellation(#[from] cancellation::CancelError), + #[error("{0}")] + Handshake(#[from] handshake::HandshakeError), + #[error("{0}")] + HandshakeTimeout(#[from] tokio::time::error::Elapsed), + #[error("{0}")] + PrepareClient(#[from] std::io::Error), + #[error("{0}")] + ReportedError(#[from] crate::stream::ReportedError), } -/// Establish a (most probably, secure) connection with the client. -/// For better testing experience, `stream` can be any object satisfying the traits. -/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; -/// we also take an extra care of propagating only the select handshake errors to client. 
-#[tracing::instrument(skip_all)] -async fn handshake( - stream: S, - mut tls: Option<&TlsConfig>, - cancel_map: &CancelMap, -) -> anyhow::Result>, StartupMessageParams)>> { - // Client may try upgrading to each protocol only once - let (mut tried_ssl, mut tried_gss) = (false, false); - - let mut stream = PqStream::new(Stream::from_raw(stream)); - loop { - let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); - - use FeStartupPacket::*; - match msg { - SslRequest => match stream.get_ref() { - Stream::Raw { .. } if !tried_ssl => { - tried_ssl = true; - - // We can't perform TLS handshake without a config - let enc = tls.is_some(); - stream.write_message(&Be::EncryptionResponse(enc)).await?; - if let Some(tls) = tls.take() { - // Upgrade raw stream into a secure TLS-backed stream. - // NOTE: We've consumed `tls`; this fact will be used later. - - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. - if !read_buf.is_empty() { - bail!("data is sent before server replied with EncryptionResponse"); - } - let tls_stream = raw.upgrade(tls.to_server_config()).await?; - - let (_, tls_server_end_point) = tls - .cert_resolver - .resolve(tls_stream.get_ref().1.server_name()) - .context("missing certificate")?; - - stream = PqStream::new(Stream::Tls { - tls: Box::new(tls_stream), - tls_server_end_point, - }); - } - } - _ => bail!(ERR_PROTO_VIOLATION), - }, - GssEncRequest => match stream.get_ref() { - Stream::Raw { .. 
} if !tried_gss => { - tried_gss = true; - - // Currently, we don't support GSSAPI - stream.write_message(&Be::EncryptionResponse(false)).await?; - } - _ => bail!(ERR_PROTO_VIOLATION), - }, - StartupMessage { params, .. } => { - // Check that the config has been consumed during upgrade - // OR we didn't provide it at all (for dev purposes). - if tls.is_some() { - stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; - } - - info!(session_type = "normal", "successful handshake"); - break Ok(Some((stream, params))); - } - CancelRequest(cancel_key_data) => { - cancel_map.cancel_session(cancel_key_data).await?; - - info!(session_type = "cancellation", "successful handshake"); - break Ok(None); - } +impl ReportableError for ClientRequestError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ClientRequestError::Cancellation(e) => e.get_error_kind(), + ClientRequestError::Handshake(e) => e.get_error_kind(), + ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit, + ClientRequestError::ReportedError(e) => e.get_error_kind(), + ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect, } } } +pub async fn handle_client( + config: &'static ProxyConfig, + ctx: &mut RequestMonitoring, + cancellation_handler: Arc, + stream: S, + mode: ClientMode, + endpoint_rate_limiter: Arc, + conn_gauge: NumClientConnectionsGuard<'static>, +) -> Result>, ClientRequestError> { + info!( + protocol = %ctx.protocol, + "handling interactive connection from client" + ); + + let metrics = &Metrics::get().proxy; + let proto = ctx.protocol; + let _request_gauge = metrics.connection_requests.guard(proto); + + let tls = config.tls_config.as_ref(); + + let record_handshake_error = !ctx.has_private_peer_addr(); + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error); + let (mut stream, params) = + match 
tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(cancel_key_data) => { + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id) + .await + .map(|()| None)?) + } + }; + drop(pause); + + ctx.set_db_options(params.clone()); + + let hostname = mode.hostname(stream.get_ref()); + + let common_names = tls.map(|tls| &tls.common_names); + + // Extract credentials which we're going to use for auth. + let result = config + .auth_backend + .as_ref() + .map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) + .transpose(); + + let user_info = match result { + Ok(user_info) => user_info, + Err(e) => stream.throw_error(e).await?, + }; + + let user = user_info.get_user().to_owned(); + let user_info = match user_info + .authenticate( + ctx, + &mut stream, + mode.allow_cleartext(), + &config.authentication_config, + endpoint_rate_limiter, + ) + .await + { + Ok(auth_result) => auth_result, + Err(e) => { + let db = params.get("database"); + let app = params.get("application_name"); + let params_span = tracing::info_span!("", ?user, ?db, ?app); + + return stream.throw_error(e).instrument(params_span).await?; + } + }; + + let mut node = connect_to_compute( + ctx, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, + &user_info, + mode.allow_self_signed_compute(config), + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, + ) + .or_else(|e| stream.throw_error(e)) + .await?; + + let session = cancellation_handler.get_session(); + prepare_client_connection(&node, &session, &mut stream).await?; + + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. 
+ let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + + Ok(Some(ProxyPassthrough { + client: stream, + aux: node.aux.clone(), + compute: node, + req: _request_gauge, + conn: conn_gauge, + cancel: session, + })) +} + /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection( +async fn prepare_client_connection

( node: &compute::PostgresConnection, - session: cancellation::Session<'_>, + session: &cancellation::Session

, stream: &mut PqStream, -) -> anyhow::Result<()> { +) -> Result<(), std::io::Error> { // Register compute's query cancellation token and produce a new, unique one. // The new token (cancel_key_data) will be sent to the client. let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); @@ -345,177 +372,58 @@ async fn prepare_client_connection( Ok(()) } -/// Forward bytes in both directions (client <-> compute). -#[tracing::instrument(skip_all)] -pub async fn proxy_pass( - ctx: &mut RequestMonitoring, - client: impl AsyncRead + AsyncWrite + Unpin, - compute: impl AsyncRead + AsyncWrite + Unpin, - aux: MetricsAuxInfo, -) -> anyhow::Result<()> { - ctx.log(); +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct NeonOptions(Vec<(SmolStr, SmolStr)>); - let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), - }); +impl NeonOptions { + pub fn parse_params(params: &StartupMessageParams) -> Self { + params + .options_raw() + .map(Self::parse_from_iter) + .unwrap_or_default() + } + pub fn parse_options_raw(options: &str) -> Self { + Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) + } - let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); - let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx")); - let mut client = MeasuredStream::new( - client, - |_| {}, - |cnt| { - // Number of bytes we sent to the client (outbound). - m_sent.inc_by(cnt as u64); - m_sent2.inc_by(cnt as u64); - usage.record_egress(cnt as u64); - }, - ); + pub fn is_ephemeral(&self) -> bool { + // Currently, neon endpoint options are all reserved for ephemeral endpoints. 
+ !self.0.is_empty() + } - let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]); - let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx")); - let mut compute = MeasuredStream::new( - compute, - |_| {}, - |cnt| { - // Number of bytes the client sent to the compute node (inbound). - m_recv.inc_by(cnt as u64); - m_recv2.inc_by(cnt as u64); - }, - ); + fn parse_from_iter<'a>(options: impl Iterator) -> Self { + let mut options = options + .filter_map(neon_option) + .map(|(k, v)| (k.into(), v.into())) + .collect_vec(); + options.sort(); + Self(options) + } - // Starting from here we only proxy the client's traffic. - info!("performing the proxy pass..."); - let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?; + pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey { + // prefix + format!(" {k}:{v}") + // kinda jank because SmolStr is immutable + std::iter::once(prefix) + .chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v])) + .collect::() + .into() + } - Ok(()) -} - -/// Thin connection context. -struct Client<'a, S> { - /// The underlying libpq protocol stream. - stream: PqStream>, - /// Client credentials that we care about. - creds: auth::BackendType<'a, auth::ClientCredentials>, - /// KV-dictionary with PostgreSQL connection params. - params: &'a StartupMessageParams, - /// Allow self-signed certificates (for testing). - allow_self_signed_compute: bool, - /// Rate limiter for endpoints - endpoint_rate_limiter: Arc, -} - -impl<'a, S> Client<'a, S> { - /// Construct a new connection context. 
- fn new( - stream: PqStream>, - creds: auth::BackendType<'a, auth::ClientCredentials>, - params: &'a StartupMessageParams, - allow_self_signed_compute: bool, - endpoint_rate_limiter: Arc, - ) -> Self { - Self { - stream, - creds, - params, - allow_self_signed_compute, - endpoint_rate_limiter, - } + /// DeepObject format + /// `paramName[prop1]=value1¶mName[prop2]=value2&...` + pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> { + self.0 + .iter() + .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone())) + .collect() } } -impl Client<'_, S> { - /// Let the client authenticate and connect to the designated compute node. - // Instrumentation logs endpoint name everywhere. Doesn't work for link - // auth; strictly speaking we don't know endpoint name in its case. - #[tracing::instrument(name = "", fields(ep = %self.creds.get_endpoint().unwrap_or_default()), skip_all)] - async fn connect_to_db( - self, - ctx: &mut RequestMonitoring, - session: cancellation::Session<'_>, - mode: ClientMode, - config: &'static AuthenticationConfig, - ) -> anyhow::Result<()> { - let Self { - mut stream, - creds, - params, - allow_self_signed_compute, - endpoint_rate_limiter, - } = self; - - // check rate limit - if let Some(ep) = creds.get_endpoint() { - if !endpoint_rate_limiter.check(ep) { - return stream - .throw_error(auth::AuthError::too_many_connections()) - .await; - } - } - - let extra = console::ConsoleReqExtra { - options: neon_options(params), - }; - - let user = creds.get_user().to_owned(); - let auth_result = match creds - .authenticate(ctx, &extra, &mut stream, mode.allow_cleartext(), config) - .await - { - Ok(auth_result) => auth_result, - Err(e) => { - let db = params.get("database"); - let app = params.get("application_name"); - let params_span = tracing::info_span!("", ?user, ?db, ?app); - - return stream.throw_error(e).instrument(params_span).await; - } - }; - - let (mut node_info, creds) = auth_result; - - node_info.allow_self_signed_compute = 
allow_self_signed_compute; - - let aux = node_info.aux.clone(); - let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &extra, &creds) - .or_else(|e| stream.throw_error(e)) - .await?; - - prepare_client_connection(&node, session, &mut stream).await?; - // Before proxy passing, forward to compute whatever data is left in the - // PqStream input buffer. Normally there is none, but our serverless npm - // driver in pipeline mode sends startup, password and first query - // immediately after opening the connection. - let (stream, read_buf) = stream.into_inner(); - node.stream.write_all(&read_buf).await?; - proxy_pass(ctx, stream, node.stream, aux).await - } -} - -pub fn neon_options(params: &StartupMessageParams) -> Vec<(String, String)> { - #[allow(unstable_name_collisions)] - match params.options_raw() { - Some(options) => options.filter_map(neon_option).collect(), - None => vec![], - } -} - -pub fn neon_options_str(params: &StartupMessageParams) -> String { - #[allow(unstable_name_collisions)] - neon_options(params) - .iter() - .map(|(k, v)| format!("{}:{}", k, v)) - .sorted() // we sort it to use as cache key - .intersperse(" ".to_owned()) - .collect() -} - -pub fn neon_option(bytes: &str) -> Option<(String, String)> { +pub fn neon_option(bytes: &str) -> Option<(&str, &str)> { static RE: OnceCell = OnceCell::new(); let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap()); let cap = re.captures(bytes)?; let (_, [k, v]) = cap.extract(); - Some((k.to_owned(), v.to_owned())) + Some((k, v)) } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 1da2dee10b..409d45b39a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,15 +1,19 @@ use crate::{ - auth, + auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError, Api}, + config::RetryConfig, + console::{self, errors::WakeComputeError, 
locks::ApiLocks, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, - metrics::{bool_to_str, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES}, - proxy::retry::{retry_after, ShouldRetry}, + error::ReportableError, + metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, + proxy::{ + retry::{retry_after, ShouldRetry}, + wake_compute::wake_compute, + }, + Host, }; use async_trait::async_trait; -use hyper::StatusCode; use pq_proto::StartupMessageParams; -use std::ops::ControlFlow; use tokio::time; use tracing::{error, info, warn}; @@ -19,39 +23,24 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. #[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { +pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { let is_cached = node_info.cached(); if is_cached { warn!("invalidating stalled compute node info cache entry"); } let label = match is_cached { - true => "compute_cached", - false => "compute_uncached", + true => ConnectionFailureKind::ComputeCached, + false => ConnectionFailureKind::ComputeUncached, }; - NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); + Metrics::get().proxy.connection_failures_total.inc(label); - node_info.invalidate().config -} - -/// Try to connect to the compute node once. 
-#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute_once( - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, -) -> Result { - let allow_self_signed_compute = node_info.allow_self_signed_compute; - - node_info - .config - .connect(ctx, allow_self_signed_compute, timeout) - .await + node_info.invalidate() } #[async_trait] pub trait ConnectMechanism { type Connection; - type ConnectError; + type ConnectError: ReportableError; type Error: From; async fn connect_once( &self, @@ -63,9 +52,22 @@ pub trait ConnectMechanism { fn update_connect_config(&self, conf: &mut compute::ConnCfg); } +#[async_trait] +pub trait ComputeConnectBackend { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result; + + fn get_keys(&self) -> Option<&ComputeCredentialKeys>; +} + pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. pub params: &'a StartupMessageParams, + + /// connect_to_compute concurrency lock + pub locks: &'static ApiLocks, } #[async_trait] @@ -74,13 +76,16 @@ impl ConnectMechanism for TcpMechanism<'_> { type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] async fn connect_once( &self, ctx: &mut RequestMonitoring, node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { - connect_to_compute_once(ctx, node_info, timeout).await + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + permit.release_result(node_info.connect(ctx, timeout).await) } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -88,117 +93,83 @@ impl ConnectMechanism for TcpMechanism<'_> { } } -fn report_error(e: &WakeComputeError, retry: bool) { - use crate::console::errors::ApiError; - let retry = bool_to_str(retry); - let kind = match e { - 
WakeComputeError::BadComputeAddress(_) => "bad_compute_address", - WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - ref text, - }) if text.contains("written data quota exceeded") - || text.contains("the limit for current plan reached") => - { - "quota_exceeded" - } - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::LOCKED, - .. - }) => "api_console_locked", - WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. - }) => "api_console_bad_request", - WakeComputeError::ApiError(ApiError::Console { status, .. }) - if status.is_server_error() => - { - "api_console_other_server_error" - } - WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error", - WakeComputeError::TimeoutError => "timeout_error", - }; - NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc(); -} - /// Try to connect to the compute node, retrying if necessary. -/// This function might update `node_info`, so we take it by `&mut`. 
#[tracing::instrument(skip_all)] -pub async fn connect_to_compute( +pub async fn connect_to_compute( ctx: &mut RequestMonitoring, mechanism: &M, - mut node_info: console::CachedNodeInfo, - extra: &console::ConsoleReqExtra, - creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, + user_info: &B, + allow_self_signed_compute: bool, + wake_compute_retry_config: RetryConfig, + connect_to_compute_retry_config: RetryConfig, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { + let mut num_retries = 0; + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; + if let Some(keys) = user_info.get_keys() { + node_info.set_keys(keys); + } + node_info.allow_self_signed_compute = allow_self_signed_compute; + // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); + let retry_type = RetryType::ConnectToCompute; // try once - let (config, err) = match mechanism + let err = match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) .await { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); return Ok(res); } - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - (invalidate_cache(node_info), e) - } + Err(e) => e, }; - ctx.latency_timer.cache_miss(); + error!(error = ?err, "could not connect to compute node"); - let mut num_retries = 1; - - // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); - let node_info = loop { - let wake_res = match creds { - auth::BackendType::Console(api, creds) => api.wake_compute(ctx, extra, creds).await, - #[cfg(feature = "testing")] - auth::BackendType::Postgres(api, creds) => api.wake_compute(ctx, extra, 
creds).await, - // nothing to do? - auth::BackendType::Link(_) => return Err(err.into()), - // test backend - #[cfg(test)] - auth::BackendType::Test(x) => x.wake_compute(), - }; - - match handle_try_wake(wake_res, num_retries) { - Err(e) => { - error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); - report_error(&e, false); - return Err(e.into()); - } - // failed to wake up but we can continue to retry - Ok(ControlFlow::Continue(e)) => { - report_error(&e, true); - warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - } - // successfully woke up a compute node and can break the wakeup loop - Ok(ControlFlow::Break(mut node_info)) => { - node_info.config.reuse_password(&config); - mechanism.update_connect_config(&mut node_info.config); - break node_info; - } + let node_info = if !node_info.cached() || !err.should_retry_database_address() { + // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. + // Do not need to retrieve a new node_info, just return the old one. + if !err.should_retry(num_retries, connect_to_compute_retry_config) { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); + return Err(err.into()); } + node_info + } else { + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + let old_node_info = invalidate_cache(node_info); + let mut node_info = + wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?; + node_info.reuse_settings(old_node_info); - let wait_duration = retry_after(num_retries); - num_retries += 1; - - time::sleep(wait_duration).await; + mechanism.update_connect_config(&mut node_info.config); + node_info }; // now that we have a new node, try connect to it repeatedly. 
// this can error for a few reasons, for instance: // * DNS connection settings haven't quite propagated yet info!("wake_compute success. attempting to connect"); + num_retries = 1; loop { match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) @@ -206,41 +177,40 @@ where { Ok(res) => { ctx.latency_timer.success(); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + num_retries.into(), + ); + info!(?num_retries, "connected to compute node after"); return Ok(res); } Err(e) => { - let retriable = e.should_retry(num_retries); + let retriable = e.should_retry(num_retries, connect_to_compute_retry_config); if !retriable { error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + num_retries.into(), + ); return Err(e.into()); } warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); } } - let wait_duration = retry_after(num_retries); + let wait_duration = retry_after(num_retries, connect_to_compute_retry_config); num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); time::sleep(wait_duration).await; - } -} - -/// Attempts to wake up the compute node. -/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable -/// * Returns Ok(Break(node)) if the wakeup succeeded -/// * Returns Err(e) if there was an error -pub fn handle_try_wake( - result: Result, - num_retries: u32, -) -> Result, WakeComputeError> { - match result { - Err(err) => match &err { - WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { - Ok(ControlFlow::Continue(err)) - } - _ => Err(err), - }, - // Ready to try again. 
- Ok(new) => Ok(ControlFlow::Break(new)), + drop(pause); } } diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs new file mode 100644 index 0000000000..aaf3688f21 --- /dev/null +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -0,0 +1,274 @@ +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; + +use std::future::poll_fn; +use std::io; +use std::pin::Pin; +use std::task::{ready, Context, Poll}; + +#[derive(Debug)] +enum TransferState { + Running(CopyBuffer), + ShuttingDown(u64), + Done(u64), +} + +fn transfer_one_direction( + cx: &mut Context<'_>, + state: &mut TransferState, + r: &mut A, + w: &mut B, +) -> Poll> +where + A: AsyncRead + AsyncWrite + Unpin + ?Sized, + B: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut r = Pin::new(r); + let mut w = Pin::new(w); + loop { + match state { + TransferState::Running(buf) => { + let count = ready!(buf.poll_copy(cx, r.as_mut(), w.as_mut()))?; + *state = TransferState::ShuttingDown(count); + } + TransferState::ShuttingDown(count) => { + ready!(w.as_mut().poll_shutdown(cx))?; + *state = TransferState::Done(*count); + } + TransferState::Done(count) => return Poll::Ready(Ok(*count)), + } + } +} + +#[tracing::instrument(skip_all)] +pub async fn copy_bidirectional_client_compute( + client: &mut Client, + compute: &mut Compute, +) -> Result<(u64, u64), std::io::Error> +where + Client: AsyncRead + AsyncWrite + Unpin + ?Sized, + Compute: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut client_to_compute = TransferState::Running(CopyBuffer::new()); + let mut compute_to_client = TransferState::Running(CopyBuffer::new()); + + poll_fn(|cx| { + let mut client_to_compute_result = + transfer_one_direction(cx, &mut client_to_compute, client, compute)?; + let mut compute_to_client_result = + transfer_one_direction(cx, &mut compute_to_client, compute, client)?; + + // Early termination checks from compute to client. 
+ if let TransferState::Done(_) = compute_to_client { + if let TransferState::Running(buf) = &client_to_compute { + info!("Compute is done, terminate client"); + // Initiate shutdown + client_to_compute = TransferState::ShuttingDown(buf.amt); + client_to_compute_result = + transfer_one_direction(cx, &mut client_to_compute, client, compute)?; + } + } + + // Early termination checks from compute to client. + if let TransferState::Done(_) = client_to_compute { + if let TransferState::Running(buf) = &compute_to_client { + info!("Client is done, terminate compute"); + // Initiate shutdown + compute_to_client = TransferState::ShuttingDown(buf.amt); + compute_to_client_result = + transfer_one_direction(cx, &mut compute_to_client, client, compute)?; + } + } + + // It is not a problem if ready! returns early ... (comment remains the same) + let client_to_compute = ready!(client_to_compute_result); + let compute_to_client = ready!(compute_to_client_result); + + Poll::Ready(Ok((client_to_compute, compute_to_client))) + }) + .await +} + +#[derive(Debug)] +pub(super) struct CopyBuffer { + read_done: bool, + need_flush: bool, + pos: usize, + cap: usize, + amt: u64, + buf: Box<[u8]>, +} +const DEFAULT_BUF_SIZE: usize = 1024; + +impl CopyBuffer { + pub(super) fn new() -> Self { + Self { + read_done: false, + need_flush: false, + pos: 0, + cap: 0, + amt: 0, + buf: vec![0; DEFAULT_BUF_SIZE].into_boxed_slice(), + } + } + + fn poll_fill_buf( + &mut self, + cx: &mut Context<'_>, + reader: Pin<&mut R>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + { + let me = &mut *self; + let mut buf = ReadBuf::new(&mut me.buf); + buf.set_filled(me.cap); + + let res = reader.poll_read(cx, &mut buf); + if let Poll::Ready(Ok(())) = res { + let filled_len = buf.filled().len(); + me.read_done = me.cap == filled_len; + me.cap = filled_len; + } + res + } + + fn poll_write_buf( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + 
?Sized, + W: AsyncWrite + ?Sized, + { + let me = &mut *self; + match writer.as_mut().poll_write(cx, &me.buf[me.pos..me.cap]) { + Poll::Pending => { + // Top up the buffer towards full if we can read a bit more + // data - this should improve the chances of a large write + if !me.read_done && me.cap < me.buf.len() { + ready!(me.poll_fill_buf(cx, reader.as_mut()))?; + } + Poll::Pending + } + res => res, + } + } + + pub(super) fn poll_copy( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + loop { + // If our buffer is empty, then we need to read some data to + // continue. + if self.pos == self.cap && !self.read_done { + self.pos = 0; + self.cap = 0; + + match self.poll_fill_buf(cx, reader.as_mut()) { + Poll::Ready(Ok(())) => (), + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx))?; + self.need_flush = false; + } + + return Poll::Pending; + } + } + } + + // If our buffer has some data, let's write it out! + while self.pos < self.cap { + let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?; + if i == 0 { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::WriteZero, + "write zero byte into writer", + ))); + } else { + self.pos += i; + self.amt += i as u64; + self.need_flush = true; + } + } + + // If pos larger than cap, this loop will never stop. + // In particular, user's wrong poll_write implementation returning + // incorrect written length may lead to thread blocking. + debug_assert!( + self.pos <= self.cap, + "writer returned length larger than input slice" + ); + + // If we've written all the data and we've seen EOF, flush out the + // data and finish the transfer. 
+ if self.pos == self.cap && self.read_done { + ready!(writer.as_mut().poll_flush(cx))?; + return Poll::Ready(Ok(self.amt)); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::io::AsyncWriteExt; + + #[tokio::test] + async fn test_client_to_compute() { + let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream + let (mut compute_proxy, mut compute_client) = tokio::io::duplex(32); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + client_client.write_all(b"hello").await.unwrap(); + client_client.shutdown().await.unwrap(); + compute_client.write_all(b"Neon").await.unwrap(); + compute_client.shutdown().await.unwrap(); + + let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) + .await + .unwrap(); + + // Assert correct transferred amounts + let (client_to_compute_count, compute_to_client_count) = result; + assert_eq!(client_to_compute_count, 5); // 'hello' was transferred + assert_eq!(compute_to_client_count, 4); // response only partially transferred or not at all + } + + #[tokio::test] + async fn test_compute_to_client() { + let (mut client_client, mut client_proxy) = tokio::io::duplex(32); // Create a mock duplex stream + let (mut compute_proxy, mut compute_client) = tokio::io::duplex(8); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + compute_client.write_all(b"hello").await.unwrap(); + compute_client.shutdown().await.unwrap(); + client_client + .write_all(b"Neon Serverless Postgres") + .await + .unwrap(); + + let result = copy_bidirectional_client_compute(&mut client_proxy, &mut compute_proxy) + .await + .unwrap(); + + // Assert correct transferred amounts + let (client_to_compute_count, compute_to_client_count) = result; + assert_eq!(compute_to_client_count, 5); // 'hello' was transferred + assert!(client_to_compute_count <= 8); // response only partially transferred or not at all 
+ } +} diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs new file mode 100644 index 0000000000..dd935cc245 --- /dev/null +++ b/proxy/src/proxy/handshake.rs @@ -0,0 +1,143 @@ +use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams}; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; + +use crate::{ + config::TlsConfig, + error::ReportableError, + proxy::ERR_INSECURE_CONNECTION, + stream::{PqStream, Stream, StreamUpgradeError}, +}; + +#[derive(Error, Debug)] +pub enum HandshakeError { + #[error("data is sent before server replied with EncryptionResponse")] + EarlyData, + + #[error("protocol violation")] + ProtocolViolation, + + #[error("missing certificate")] + MissingCertificate, + + #[error("{0}")] + StreamUpgradeError(#[from] StreamUpgradeError), + + #[error("{0}")] + Io(#[from] std::io::Error), + + #[error("{0}")] + ReportedError(#[from] crate::stream::ReportedError), +} + +impl ReportableError for HandshakeError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + HandshakeError::EarlyData => crate::error::ErrorKind::User, + HandshakeError::ProtocolViolation => crate::error::ErrorKind::User, + // This error should not happen, but will if we have no default certificate and + // the client sends no SNI extension. + // If they provide SNI then we can be sure there is a certificate that matches. 
+ HandshakeError::MissingCertificate => crate::error::ErrorKind::Service, + HandshakeError::StreamUpgradeError(upgrade) => match upgrade { + StreamUpgradeError::AlreadyTls => crate::error::ErrorKind::Service, + StreamUpgradeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + }, + HandshakeError::Io(_) => crate::error::ErrorKind::ClientDisconnect, + HandshakeError::ReportedError(e) => e.get_error_kind(), + } + } +} + +pub enum HandshakeData { + Startup(PqStream>, StartupMessageParams), + Cancel(CancelKeyData), +} + +/// Establish a (most probably, secure) connection with the client. +/// For better testing experience, `stream` can be any object satisfying the traits. +/// It's easier to work with owned `stream` here as we need to upgrade it to TLS; +/// we also take an extra care of propagating only the select handshake errors to client. +#[tracing::instrument(skip_all)] +pub async fn handshake( + stream: S, + mut tls: Option<&TlsConfig>, + record_handshake_error: bool, +) -> Result, HandshakeError> { + // Client may try upgrading to each protocol only once + let (mut tried_ssl, mut tried_gss) = (false, false); + + let mut stream = PqStream::new(Stream::from_raw(stream)); + loop { + let msg = stream.read_startup_packet().await?; + info!("received {msg:?}"); + + use FeStartupPacket::*; + match msg { + SslRequest => match stream.get_ref() { + Stream::Raw { .. } if !tried_ssl => { + tried_ssl = true; + + // We can't perform TLS handshake without a config + let enc = tls.is_some(); + stream.write_message(&Be::EncryptionResponse(enc)).await?; + if let Some(tls) = tls.take() { + // Upgrade raw stream into a secure TLS-backed stream. + // NOTE: We've consumed `tls`; this fact will be used later. + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empy. 
+ // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + return Err(HandshakeError::EarlyData); + } + let tls_stream = raw + .upgrade(tls.to_server_config(), record_handshake_error) + .await?; + + let (_, tls_server_end_point) = tls + .cert_resolver + .resolve(tls_stream.get_ref().1.server_name()) + .ok_or(HandshakeError::MissingCertificate)?; + + stream = PqStream::new(Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }); + } + } + _ => return Err(HandshakeError::ProtocolViolation), + }, + GssEncRequest => match stream.get_ref() { + Stream::Raw { .. } if !tried_gss => { + tried_gss = true; + + // Currently, we don't support GSSAPI + stream.write_message(&Be::EncryptionResponse(false)).await?; + } + _ => return Err(HandshakeError::ProtocolViolation), + }, + StartupMessage { params, .. } => { + // Check that the config has been consumed during upgrade + // OR we didn't provide it at all (for dev purposes). 
+ if tls.is_some() { + return stream + .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User) + .await?; + } + + info!(session_type = "normal", "successful handshake"); + break Ok(HandshakeData::Startup(stream, params)); + } + CancelRequest(cancel_key_data) => { + info!(session_type = "cancellation", "successful handshake"); + break Ok(HandshakeData::Cancel(cancel_key_data)); + } + } + } +} diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs new file mode 100644 index 0000000000..62de79946f --- /dev/null +++ b/proxy/src/proxy/passthrough.rs @@ -0,0 +1,74 @@ +use crate::{ + cancellation, + compute::PostgresConnection, + console::messages::MetricsAuxInfo, + metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, + stream::Stream, + usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::info; +use utils::measured_stream::MeasuredStream; + +/// Forward bytes in both directions (client <-> compute). +#[tracing::instrument(skip_all)] +pub async fn proxy_pass( + client: impl AsyncRead + AsyncWrite + Unpin, + compute: impl AsyncRead + AsyncWrite + Unpin, + aux: MetricsAuxInfo, +) -> anyhow::Result<()> { + let usage = USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }); + + let metrics = &Metrics::get().proxy.io_bytes; + let m_sent = metrics.with_labels(Direction::Tx); + let mut client = MeasuredStream::new( + client, + |_| {}, + |cnt| { + // Number of bytes we sent to the client (outbound). + metrics.get_metric(m_sent).inc_by(cnt as u64); + usage.record_egress(cnt as u64); + }, + ); + + let m_recv = metrics.with_labels(Direction::Rx); + let mut compute = MeasuredStream::new( + compute, + |_| {}, + |cnt| { + // Number of bytes the client sent to the compute node (inbound). + metrics.get_metric(m_recv).inc_by(cnt as u64); + }, + ); + + // Starting from here we only proxy the client's traffic. 
+ info!("performing the proxy pass..."); + let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute( + &mut client, + &mut compute, + ) + .await?; + + Ok(()) +} + +pub struct ProxyPassthrough { + pub client: Stream, + pub compute: PostgresConnection, + pub aux: MetricsAuxInfo, + + pub req: NumConnectionRequestsGuard<'static>, + pub conn: NumClientConnectionsGuard<'static>, + pub cancel: cancellation::Session

, +} + +impl ProxyPassthrough { + pub async fn proxy_pass(self) -> anyhow::Result<()> { + let res = proxy_pass(self.client, self.compute.stream, self.aux).await; + self.compute.cancel_closure.try_cancel_query().await?; + res + } +} diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index a85ed380b0..8dec1f1137 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,21 +1,18 @@ -use crate::compute; +use crate::{compute, config::RetryConfig}; use std::{error::Error, io}; use tokio::time; -/// Number of times we should retry the `/proxy_wake_compute` http request. -/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0 -pub const NUM_RETRIES_CONNECT: u32 = 16; -const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25); -const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2; - pub trait ShouldRetry { fn could_retry(&self) -> bool; - fn should_retry(&self, num_retries: u32) -> bool { + fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool { match self { - _ if num_retries >= NUM_RETRIES_CONNECT => false, + _ if num_retries >= config.max_retries => false, err => err.could_retry(), } } + fn should_retry_database_address(&self) -> bool { + true + } } impl ShouldRetry for io::Error { @@ -39,6 +36,21 @@ impl ShouldRetry for tokio_postgres::error::DbError { | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, ) } + fn should_retry_database_address(&self) -> bool { + use tokio_postgres::error::SqlState; + // Here are errors that happens after the user successfully authenticated to the database. + // TODO: there are pgbouncer errors that should be retried, but they are not listed here. 
+ !matches!( + self.code(), + &SqlState::TOO_MANY_CONNECTIONS + | &SqlState::OUT_OF_MEMORY + | &SqlState::SYNTAX_ERROR + | &SqlState::T_R_SERIALIZATION_FAILURE + | &SqlState::INVALID_CATALOG_NAME + | &SqlState::INVALID_SCHEMA_NAME + | &SqlState::INVALID_PARAMETER_VALUE + ) + } } impl ShouldRetry for tokio_postgres::Error { @@ -51,6 +63,15 @@ impl ShouldRetry for tokio_postgres::Error { false } } + fn should_retry_database_address(&self) -> bool { + if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { + io::Error::should_retry_database_address(io_err) + } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { + tokio_postgres::error::DbError::should_retry_database_address(db_err) + } else { + true + } + } } impl ShouldRetry for compute::ConnectionError { @@ -61,8 +82,19 @@ impl ShouldRetry for compute::ConnectionError { _ => false, } } + fn should_retry_database_address(&self) -> bool { + match self { + compute::ConnectionError::Postgres(err) => err.should_retry_database_address(), + compute::ConnectionError::CouldNotConnect(err) => err.should_retry_database_address(), + // the cache entry was not checked for validity + compute::ConnectionError::TooManyConnectionAttempts(_) => false, + _ => true, + } + } } -pub fn retry_after(num_retries: u32) -> time::Duration { - BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1)) +pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { + config + .base_delay + .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1)) } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 0957f33a92..96683511fe 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -2,16 +2,26 @@ mod mitm; +use std::time::Duration; + use super::connect_compute::ConnectMechanism; use super::retry::ShouldRetry; use super::*; -use crate::auth::backend::{ComputeUserInfo, TestBackend}; -use crate::config::CertResolver; -use 
crate::console::{CachedNodeInfo, NodeInfo}; -use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; -use crate::{auth, http, sasl, scram}; +use crate::auth::backend::{ + ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, +}; +use crate::config::{CertResolver, RetryConfig}; +use crate::console::caches::NodeInfoCache; +use crate::console::messages::{ConsoleError, MetricsAuxInfo}; +use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; +use crate::console::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ErrorKind; +use crate::proxy::retry::retry_after; +use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; +use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; +use rustls::pki_types; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; @@ -20,7 +30,11 @@ use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; fn generate_certs( hostname: &str, common_name: &str, -) -> anyhow::Result<(rustls::Certificate, rustls::Certificate, rustls::PrivateKey)> { +) -> anyhow::Result<( + pki_types::CertificateDer<'static>, + pki_types::CertificateDer<'static>, + pki_types::PrivateKeyDer<'static>, +)> { let ca = rcgen::Certificate::from_params({ let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); @@ -37,9 +51,9 @@ fn generate_certs( })?; Ok(( - rustls::Certificate(ca.serialize_der()?), - rustls::Certificate(cert.serialize_der_with_signer(&ca)?), - rustls::PrivateKey(cert.serialize_private_key_der()), + pki_types::CertificateDer::from(ca.serialize_der()?), + pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), + pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), )) } @@ -74,15 +88,14 @@ fn generate_tls_config<'a>( let tls_config = { let config = 
rustls::ServerConfig::builder() - .with_safe_defaults() .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone())? + .with_single_cert(vec![cert.clone()], key.clone_key())? .into(); let mut cert_resolver = CertResolver::new(); cert_resolver.add_cert(key, vec![cert], true)?; - let common_names = Some(cert_resolver.get_common_names()); + let common_names = cert_resolver.get_common_names(); TlsConfig { config, @@ -93,10 +106,9 @@ fn generate_tls_config<'a>( let client_config = { let config = rustls::ClientConfig::builder() - .with_safe_defaults() .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); - store.add(&ca)?; + store.add(ca)?; store }) .with_no_client_auth(); @@ -124,15 +136,15 @@ impl TestAuth for NoAuth {} struct Scram(scram::ServerSecret); impl Scram { - fn new(password: &str) -> anyhow::Result { - let salt = rand::random::<[u8; 16]>(); - let secret = scram::ServerSecret::build(password, &salt, 256) + async fn new(password: &str) -> anyhow::Result { + let secret = scram::ServerSecret::build(password) + .await .context("failed to generate scram secret")?; Ok(Scram(secret)) } - fn mock(user: &str) -> Self { - Scram(scram::ServerSecret::mock(user, rand::random())) + fn mock() -> Self { + Scram(scram::ServerSecret::mock(rand::random())) } } @@ -143,7 +155,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0)) + .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) .await? .authenticate() .await?; @@ -162,11 +174,11 @@ async fn dummy_proxy( tls: Option, auth: impl TestAuth + Send, ) -> anyhow::Result<()> { - let cancel_map = CancelMap::default(); - let client = WithClientIp::new(client); - let (mut stream, _params) = handshake(client, tls.as_ref(), &cancel_map) - .await? 
- .context("handshake failed")?; + let (client, _) = read_proxy_protocol(client).await?; + let mut stream = match handshake(client, tls.as_ref(), false).await? { + HandshakeData::Startup(stream, _) => stream, + HandshakeData::Cancel(_) => bail!("cancellation not supported"), + }; auth.authenticate(&mut stream).await?; @@ -274,7 +286,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new(password)?, + Scram::new(password).await?, )); let (_client, _conn) = tokio_postgres::Config::new() @@ -298,7 +310,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let (_client, _conn) = tokio_postgres::Config::new() @@ -319,11 +331,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { let (client_config, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; - let proxy = tokio::spawn(dummy_proxy( - client, - Some(server_config), - Scram::mock("user"), - )); + let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); use rand::{distributions::Alphanumeric, Rng}; let password: String = rand::thread_rng() @@ -353,11 +361,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 1..NUM_RETRIES_CONNECT { - total_wait += retry_after(num_retries); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + for num_retries in 1..config.max_retries { + total_wait += retry_after(num_retries, config); } - assert!(total_wait < tokio::time::Duration::from_secs(12)); - assert!(total_wait > tokio::time::Duration::from_secs(10)); + assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1); } #[derive(Clone, Copy, Debug)] 
@@ -370,9 +382,11 @@ enum ConnectAction { Fail, } +#[derive(Clone)] struct TestConnectMechanism { counter: Arc>, sequence: Vec, + cache: &'static NodeInfoCache, } impl TestConnectMechanism { @@ -391,6 +405,12 @@ impl TestConnectMechanism { Self { counter: Arc::new(std::sync::Mutex::new(0)), sequence, + cache: Box::leak(Box::new(NodeInfoCache::new( + "test", + 1, + Duration::from_secs(100), + false, + ))), } } } @@ -401,6 +421,13 @@ struct TestConnection; #[derive(Debug)] struct TestConnectError { retryable: bool, + kind: crate::error::ErrorKind, +} + +impl ReportableError for TestConnectError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + self.kind + } } impl std::fmt::Display for TestConnectError { @@ -434,8 +461,14 @@ impl ConnectMechanism for TestConnectMechanism { *counter += 1; match action { ConnectAction::Connect => Ok(TestConnection), - ConnectAction::Retry => Err(TestConnectError { retryable: true }), - ConnectAction::Fail => Err(TestConnectError { retryable: false }), + ConnectAction::Retry => Err(TestConnectError { + retryable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::Fail => Err(TestConnectError { + retryable: false, + kind: ErrorKind::Compute, + }), x => panic!("expecting action {:?}, connect is called instead", x), } } @@ -449,20 +482,22 @@ impl TestBackend for TestConnectMechanism { let action = self.sequence[*counter]; *counter += 1; match action { - ConnectAction::Wake => Ok(helper_create_cached_node_info()), + ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = console::errors::ApiError::Console { - status: http::StatusCode::FORBIDDEN, - text: "TEST".into(), - }; + let err = console::errors::ApiError::Console(ConsoleError { + http_status_code: http::StatusCode::FORBIDDEN, + error: "TEST".into(), + status: None, + }); assert!(!err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = 
console::errors::ApiError::Console { - status: http::StatusCode::BAD_REQUEST, - text: "TEST".into(), - }; + let err = console::errors::ApiError::Console(ConsoleError { + http_status_code: http::StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + }); assert!(err.could_retry()); Err(console::errors::WakeComputeError::ApiError(err)) } @@ -470,40 +505,62 @@ impl TestBackend for TestConnectMechanism { } } - fn get_allowed_ips(&self) -> Result>, console::errors::GetAuthInfoError> { + fn get_allowed_ips_and_secret( + &self, + ) -> Result<(CachedAllowedIps, Option), console::errors::GetAuthInfoError> + { + unimplemented!("not used in tests") + } + fn get_role_secret(&self) -> Result { unimplemented!("not used in tests") } } -fn helper_create_cached_node_info() -> CachedNodeInfo { +fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; - CachedNodeInfo::new_uncached(node) + let (_, node) = cache.insert("key".into(), node); + node } fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> ( - CachedNodeInfo, - console::ConsoleReqExtra, - auth::BackendType<'_, ComputeUserInfo>, -) { - let cache = helper_create_cached_node_info(); - let extra = console::ConsoleReqExtra { options: vec![] }; - let creds = auth::BackendType::Test(mechanism); - (cache, extra, creds) +) -> auth::BackendType<'static, ComputeCredentials, &()> { + let user_info = auth::BackendType::Console( + MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))), + ComputeCredentials { + info: ComputeUserInfo { + endpoint: "endpoint".into(), + user: "user".into(), + options: 
NeonOptions::parse_options_raw(""), + }, + keys: ComputeCredentialKeys::Password("password".into()), + }, + ); + user_info } #[tokio::test] async fn connect_to_compute_success() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Connect]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds) + let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -511,11 +568,17 @@ async fn connect_to_compute_success() { #[tokio::test] async fn connect_to_compute_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -524,11 +587,17 @@ async fn connect_to_compute_retry() { /// Test that we don't retry if the error is not retryable. 
#[tokio::test] async fn connect_to_compute_non_retry_1() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); @@ -537,11 +606,17 @@ async fn connect_to_compute_non_retry_1() { /// Even for non-retryable errors, we should retry at least once. #[tokio::test] async fn connect_to_compute_non_retry_2() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds) + let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -550,28 +625,50 @@ async fn connect_to_compute_non_retry_2() { /// Retry for at most `NUM_RETRIES_CONNECT` times. 
#[tokio::test] async fn connect_to_compute_non_retry_3() { - assert_eq!(NUM_RETRIES_CONNECT, 16); + let _ = env_logger::try_init(); + tokio::time::pause(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![ - Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - Retry, Retry, Retry, Retry, /* the 17th time */ Retry, - ]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds) - .await - .unwrap_err(); + let mechanism = + TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]); + let user_info = helper_create_connect_info(&mechanism); + let wake_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 1, + backoff_factor: 2.0, + }; + let connect_to_compute_retry_config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute( + &mut ctx, + &mechanism, + &user_info, + false, + wake_compute_retry_config, + connect_to_compute_retry_config, + ) + .await + .unwrap_err(); mechanism.verify(); } /// Should retry wake compute. 
#[tokio::test] async fn wake_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap(); mechanism.verify(); @@ -580,11 +677,17 @@ async fn wake_retry() { /// Wake failed with a non-retryable error. #[tokio::test] async fn wake_non_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); - let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); + let user_info = helper_create_connect_info(&mechanism); + let config = RetryConfig { + base_delay: Duration::from_secs(1), + max_retries: 5, + backoff_factor: 2.0, + }; + connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index a0a84a1dc0..cbfc9f1358 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -11,7 +11,6 @@ use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; use tokio::io::{AsyncReadExt, DuplexStream}; -use tokio_postgres::config::SslMode; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; @@ -35,12 
+34,13 @@ async fn proxy_mitm( tokio::spawn(async move { // begin handshake with end_server let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await; - // process handshake with end_client - let (end_client, startup) = - handshake(client1, Some(&server_config1), &CancelMap::default()) - .await - .unwrap() - .unwrap(); + let (end_client, startup) = match handshake(client1, Some(&server_config1), false) + .await + .unwrap() + { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(_) => panic!("cancellation not supported"), + }; let mut end_server = tokio_util::codec::Framed::new(end_server, PgFrame); let (end_client, buf) = end_client.framed.into_inner(); @@ -151,7 +151,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() @@ -234,7 +234,7 @@ async fn connect_failure( let proxy = tokio::spawn(dummy_proxy( client, Some(server_config), - Scram::new("password")?, + Scram::new("password").await?, )); let _client_err = tokio_postgres::Config::new() diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs new file mode 100644 index 0000000000..c166cf4389 --- /dev/null +++ b/proxy/src/proxy/wake_compute.rs @@ -0,0 +1,172 @@ +use crate::config::RetryConfig; +use crate::console::messages::ConsoleError; +use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; +use crate::context::RequestMonitoring; +use crate::metrics::{ + ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, + WakeupFailureKind, +}; +use crate::proxy::retry::retry_after; +use hyper1::StatusCode; +use std::ops::ControlFlow; +use tracing::{error, info, warn}; + +use super::connect_compute::ComputeConnectBackend; +use super::retry::ShouldRetry; + +pub async fn wake_compute( + 
num_retries: &mut u32, + ctx: &mut RequestMonitoring, + api: &B, + config: RetryConfig, +) -> Result { + let retry_type = RetryType::WakeCompute; + loop { + let wake_res = api.wake_compute(ctx).await; + match handle_try_wake(wake_res, *num_retries, config) { + Err(e) => { + error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); + report_error(&e, false); + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Failed, + retry_type, + }, + (*num_retries).into(), + ); + return Err(e); + } + Ok(ControlFlow::Continue(e)) => { + warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + report_error(&e, true); + } + Ok(ControlFlow::Break(n)) => { + Metrics::get().proxy.retries_metric.observe( + RetriesMetricGroup { + outcome: ConnectOutcome::Success, + retry_type, + }, + (*num_retries).into(), + ); + info!(?num_retries, "compute node woken up after"); + return Ok(n); + } + } + + let wait_duration = retry_after(*num_retries, config); + *num_retries += 1; + let pause = ctx + .latency_timer + .pause(crate::metrics::Waiting::RetryTimeout); + tokio::time::sleep(wait_duration).await; + drop(pause); + } +} + +/// Attempts to wake up the compute node. +/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable +/// * Returns Ok(Break(node)) if the wakeup succeeded +/// * Returns Err(e) if there was an error +pub fn handle_try_wake( + result: Result, + num_retries: u32, + config: RetryConfig, +) -> Result, WakeComputeError> { + match result { + Err(err) => match &err { + WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => { + Ok(ControlFlow::Continue(err)) + } + _ => Err(err), + }, + // Ready to try again. 
+ Ok(new) => Ok(ControlFlow::Break(new)), + } +} + +fn report_error(e: &WakeComputeError, retry: bool) { + use crate::console::errors::ApiError; + let kind = match e { + WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, + WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, + WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() { + crate::console::messages::Reason::RoleProtected => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::ResourceNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::ProjectNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::EndpointNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::BranchNotFound => { + WakeupFailureKind::ApiConsoleBadRequest + } + crate::console::messages::Reason::RateLimitExceeded => { + WakeupFailureKind::ApiConsoleLocked + } + crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::ActiveTimeQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::ComputeTimeQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::WrittenDataQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::DataTransferQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::LogicalSizeQuotaExceeded => { + WakeupFailureKind::QuotaExceeded + } + crate::console::messages::Reason::Unknown => match e { + ConsoleError { + http_status_code: StatusCode::LOCKED, + ref error, + .. 
+ } if error.contains("written data quota exceeded") + || error.contains("the limit for current plan reached") => + { + WakeupFailureKind::QuotaExceeded + } + ConsoleError { + http_status_code: StatusCode::UNPROCESSABLE_ENTITY, + ref error, + .. + } if error.contains("compute time quota of non-primary branches is exceeded") => { + WakeupFailureKind::QuotaExceeded + } + ConsoleError { + http_status_code: StatusCode::LOCKED, + .. + } => WakeupFailureKind::ApiConsoleLocked, + ConsoleError { + http_status_code: StatusCode::BAD_REQUEST, + .. + } => WakeupFailureKind::ApiConsoleBadRequest, + ConsoleError { + http_status_code, .. + } if http_status_code.is_server_error() => { + WakeupFailureKind::ApiConsoleOtherServerError + } + ConsoleError { .. } => WakeupFailureKind::ApiConsoleOtherError, + }, + }, + WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, + WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, + }; + Metrics::get() + .proxy + .connection_failures_breakdown + .inc(ConnectionFailuresBreakdownGroup { + kind, + retry: retry.into(), + }); +} diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index b26386d159..be9072dd8c 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -1,7 +1,6 @@ -mod aimd; mod limit_algorithm; mod limiter; -pub use aimd::Aimd; -pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; -pub use limiter::Limiter; -pub use limiter::{EndpointRateLimiter, RateBucketInfo}; +pub use limit_algorithm::{ + aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, +}; +pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo}; diff --git a/proxy/src/rate_limiter/aimd.rs b/proxy/src/rate_limiter/aimd.rs deleted file mode 100644 index 2c14a54a6c..0000000000 --- a/proxy/src/rate_limiter/aimd.rs +++ /dev/null @@ -1,166 +0,0 @@ -use std::usize; - -use async_trait::async_trait; - 
-use super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample}; - -use super::limiter::Outcome; - -/// Loss-based congestion avoidance. -/// -/// Additive-increase, multiplicative decrease. -/// -/// Adds available currency when: -/// 1. no load-based errors are observed, and -/// 2. the utilisation of the current limit is high. -/// -/// Reduces available concurrency by a factor when load-based errors are detected. -pub struct Aimd { - min_limit: usize, - max_limit: usize, - decrease_factor: f32, - increase_by: usize, - min_utilisation_threshold: f32, -} - -impl Aimd { - pub fn new(config: AimdConfig) -> Self { - Self { - min_limit: config.aimd_min_limit, - max_limit: config.aimd_max_limit, - decrease_factor: config.aimd_decrease_factor, - increase_by: config.aimd_increase_by, - min_utilisation_threshold: config.aimd_min_utilisation_threshold, - } - } -} - -#[async_trait] -impl LimitAlgorithm for Aimd { - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize { - use Outcome::*; - match sample.outcome { - Success => { - let utilisation = sample.in_flight as f32 / old_limit as f32; - - if utilisation > self.min_utilisation_threshold { - let limit = old_limit + self.increase_by; - limit.clamp(self.min_limit, self.max_limit) - } else { - old_limit - } - } - Overload => { - let limit = old_limit as f32 * self.decrease_factor; - - // Floor instead of round, so the limit reduces even with small numbers. - // E.g. 
round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 - let limit = limit.floor() as usize; - - limit.clamp(self.min_limit, self.max_limit) - } - } - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use tokio::sync::Notify; - - use super::*; - - use crate::rate_limiter::{Limiter, RateLimiterConfig}; - - #[tokio::test] - async fn should_decrease_limit_on_overload() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let release_notifier = Arc::new(Notify::new()); - - let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone()); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, Some(Outcome::Overload)).await; - release_notifier.notified().await; - assert_eq!(limiter.state().limit(), 5, "overload: decrease"); - } - - #[tokio::test] - async fn should_increase_limit_on_success_when_using_gt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - aimd_increase_by: 1, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - let _token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - assert_eq!(limiter.state().limit(), 5, "success: increase"); - } - - #[tokio::test] - async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { - let config = RateLimiterConfig { - initial_limit: 4, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - 
limiter.release(token, Some(Outcome::Success)).await; - assert_eq!( - limiter.state().limit(), - 4, - "success: ignore when < half limit" - ); - } - - #[tokio::test] - async fn should_not_change_limit_when_no_outcome() { - let config = RateLimiterConfig { - initial_limit: 10, - aimd_config: Some(AimdConfig { - aimd_decrease_factor: 0.5, - aimd_min_utilisation_threshold: 0.5, - ..Default::default() - }), - disable: false, - ..Default::default() - }; - - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - limiter.release(token, None).await; - assert_eq!(limiter.state().limit(), 10, "ignore"); - } -} diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 5cd2d5ebb7..3842ce269e 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -1,18 +1,35 @@ //! Algorithms for controlling concurrency limits. -use async_trait::async_trait; -use std::time::Duration; +use parking_lot::Mutex; +use std::{pin::pin, sync::Arc, time::Duration}; +use tokio::{ + sync::Notify, + time::{error::Elapsed, Instant}, +}; -use super::{limiter::Outcome, Aimd}; +use self::aimd::Aimd; -/// An algorithm for controlling a concurrency limit. -#[async_trait] -pub trait LimitAlgorithm: Send + Sync + 'static { - /// Update the concurrency limit in response to a new job completion. - async fn update(&mut self, old_limit: usize, sample: Sample) -> usize; +pub mod aimd; + +/// Whether a job succeeded or failed as a result of congestion/overload. +/// +/// Errors not considered to be caused by overload should be ignored. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Outcome { + /// The job succeeded, or failed in a way unrelated to overload. + Success, + /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal + /// was observed. + Overload, } -/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay). 
-#[derive(Debug, Clone, PartialEq, Eq)] +/// An algorithm for controlling a concurrency limit. +pub trait LimitAlgorithm: Send + Sync + 'static { + /// Update the concurrency limit in response to a new job completion. + fn update(&self, old_limit: usize, sample: Sample) -> usize; +} + +/// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay). +#[derive(Debug, Clone, PartialEq, Eq, Copy)] pub struct Sample { pub(crate) latency: Duration, /// Jobs in flight when the sample was taken. @@ -20,79 +37,229 @@ pub struct Sample { pub(crate) outcome: Outcome, } -#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)] +#[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)] +#[serde(rename_all = "snake_case")] pub enum RateLimitAlgorithm { - Fixed, #[default] - Aimd, + Fixed, + Aimd { + #[serde(flatten)] + conf: Aimd, + }, } pub struct Fixed; -#[async_trait] impl LimitAlgorithm for Fixed { - async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize { + fn update(&self, old_limit: usize, _sample: Sample) -> usize { old_limit } } -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] pub struct RateLimiterConfig { - pub disable: bool, + #[serde(flatten)] pub algorithm: RateLimitAlgorithm, - pub timeout: Duration, pub initial_limit: usize, - pub aimd_config: Option, } impl RateLimiterConfig { pub fn create_rate_limit_algorithm(self) -> Box { match self.algorithm { RateLimitAlgorithm::Fixed => Box::new(Fixed), - RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory. 
+ RateLimitAlgorithm::Aimd { conf } => Box::new(conf), } } } -impl Default for RateLimiterConfig { - fn default() -> Self { +pub struct LimiterInner { + alg: Box, + available: usize, + limit: usize, + in_flight: usize, +} + +impl LimiterInner { + fn update_limit(&mut self, latency: Duration, outcome: Option) { + if let Some(outcome) = outcome { + let sample = Sample { + latency, + in_flight: self.in_flight, + outcome, + }; + self.limit = self.alg.update(self.limit, sample); + } + } + + fn take(&mut self, ready: &Notify) -> Option<()> { + if self.available >= 1 { + self.available -= 1; + self.in_flight += 1; + + // tell the next in the queue that there is a permit ready + if self.available >= 1 { + ready.notify_one(); + } + Some(()) + } else { + None + } + } +} + +/// Limits the number of concurrent jobs. +/// +/// Concurrency is limited through the use of [`Token`]s. Acquire a token to run a job, and release the +/// token once the job is finished. +/// +/// The limit will be automatically adjusted based on observed latency (delay) and/or failures +/// caused by overload (loss). +pub struct DynamicLimiter { + config: RateLimiterConfig, + inner: Mutex, + // to notify when a token is available + ready: Notify, +} + +/// A concurrency token, required to run a job. +/// +/// Release the token back to the [`DynamicLimiter`] after the job is complete. +pub struct Token { + start: Instant, + limiter: Option>, +} + +/// A snapshot of the state of the [`DynamicLimiter`]. +/// +/// Not guaranteed to be consistent under high concurrency. +#[derive(Debug, Clone, Copy)] +pub struct LimiterState { + limit: usize, + in_flight: usize, +} + +impl DynamicLimiter { + /// Create a limiter with a given limit control algorithm. 
+ pub fn new(config: RateLimiterConfig) -> Arc { + let ready = Notify::new(); + ready.notify_one(); + + Arc::new(Self { + inner: Mutex::new(LimiterInner { + alg: config.create_rate_limit_algorithm(), + available: config.initial_limit, + limit: config.initial_limit, + in_flight: 0, + }), + ready, + config, + }) + } + + /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. + pub async fn acquire_timeout(self: &Arc, duration: Duration) -> Result { + tokio::time::timeout(duration, self.acquire()).await? + } + + /// Try to acquire a concurrency [Token]. + async fn acquire(self: &Arc) -> Result { + if self.config.initial_limit == 0 { + // If the rate limiter is disabled, we can always acquire a token. + Ok(Token::disabled()) + } else { + let mut notified = pin!(self.ready.notified()); + let mut ready = notified.as_mut().enable(); + loop { + if ready { + let mut inner = self.inner.lock(); + if inner.take(&self.ready).is_some() { + break Ok(Token::new(self.clone())); + } else { + notified.set(self.ready.notified()); + } + } + notified.as_mut().await; + ready = true; + } + } + } + + /// Return the concurrency [Token], along with the outcome of the job. + /// + /// The [Outcome] of the job, and the time taken to perform it, may be used + /// to update the concurrency limit. + /// + /// Set the outcome to `None` to ignore the job. + fn release_inner(&self, start: Instant, outcome: Option) { + tracing::info!("outcome is {:?}", outcome); + if self.config.initial_limit == 0 { + return; + } + + let mut inner = self.inner.lock(); + + inner.update_limit(start.elapsed(), outcome); + + inner.in_flight -= 1; + if inner.in_flight < inner.limit { + inner.available = inner.limit - inner.in_flight; + // At least 1 permit is now available + self.ready.notify_one(); + } + } + + /// The current state of the limiter. 
+ pub fn state(&self) -> LimiterState { + let inner = self.inner.lock(); + LimiterState { + limit: inner.limit, + in_flight: inner.in_flight, + } + } +} + +impl Token { + fn new(limiter: Arc) -> Self { Self { - disable: true, - algorithm: RateLimitAlgorithm::Aimd, - timeout: Duration::from_secs(1), - initial_limit: 100, - aimd_config: Some(AimdConfig::default()), + start: Instant::now(), + limiter: Some(limiter), } } -} - -#[derive(clap::Parser, Clone, Copy, Debug)] -pub struct AimdConfig { - /// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1)] - pub aimd_min_limit: usize, - /// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 1500)] - pub aimd_max_limit: usize, - /// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 10)] - pub aimd_increase_by: usize, - /// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`. - #[clap(long, default_value_t = 0.9)] - pub aimd_decrease_factor: f32, - /// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`. 
- #[clap(long, default_value_t = 0.8)] - pub aimd_min_utilisation_threshold: f32, -} - -impl Default for AimdConfig { - fn default() -> Self { + pub fn disabled() -> Self { Self { - aimd_min_limit: 1, - aimd_max_limit: 1500, - aimd_increase_by: 10, - aimd_decrease_factor: 0.9, - aimd_min_utilisation_threshold: 0.8, + start: Instant::now(), + limiter: None, + } + } + + pub fn is_disabled(&self) -> bool { + self.limiter.is_none() + } + + pub fn release(mut self, outcome: Outcome) { + self.release_mut(Some(outcome)) + } + + pub fn release_mut(&mut self, outcome: Option) { + if let Some(limiter) = self.limiter.take() { + limiter.release_inner(self.start, outcome); } } } + +impl Drop for Token { + fn drop(&mut self) { + self.release_mut(None) + } +} + +impl LimiterState { + /// The current concurrency limit. + pub fn limit(&self) -> usize { + self.limit + } + /// The number of jobs in flight. + pub fn in_flight(&self) -> usize { + self.in_flight + } +} diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs new file mode 100644 index 0000000000..b39740bb21 --- /dev/null +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -0,0 +1,266 @@ +use super::{LimitAlgorithm, Outcome, Sample}; + +/// Loss-based congestion avoidance. +/// +/// Additive-increase, multiplicative decrease. +/// +/// Adds available currency when: +/// 1. no load-based errors are observed, and +/// 2. the utilisation of the current limit is high. +/// +/// Reduces available concurrency by a factor when load-based errors are detected. +#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)] +pub struct Aimd { + /// Minimum limit for AIMD algorithm. + pub min: usize, + /// Maximum limit for AIMD algorithm. + pub max: usize, + /// Decrease AIMD decrease by value in case of error. + pub dec: f32, + /// Increase AIMD increase by value in case of success. + pub inc: usize, + /// A threshold below which the limit won't be increased. 
+ pub utilisation: f32, +} + +impl LimitAlgorithm for Aimd { + fn update(&self, old_limit: usize, sample: Sample) -> usize { + use Outcome::*; + match sample.outcome { + Success => { + let utilisation = sample.in_flight as f32 / old_limit as f32; + + if utilisation > self.utilisation { + let limit = old_limit + self.inc; + let increased_limit = limit.clamp(self.min, self.max); + if increased_limit > old_limit { + tracing::info!(increased_limit, "limit increased"); + } + + increased_limit + } else { + old_limit + } + } + Overload => { + let limit = old_limit as f32 * self.dec; + + // Floor instead of round, so the limit reduces even with small numbers. + // E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1 + let limit = limit.floor() as usize; + + let limit = limit.clamp(self.min, self.max); + tracing::info!(limit, "limit decreased"); + limit + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use crate::rate_limiter::limit_algorithm::{ + DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, + }; + + use super::*; + + #[tokio::test(start_paused = true)] + async fn increase_decrease() { + let config = RateLimiterConfig { + initial_limit: 1, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 2, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Success); + + assert_eq!(limiter.state().limit(), 2); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Success); + assert_eq!(limiter.state().limit(), 2); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + assert_eq!(limiter.state().limit(), 1); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + token.release(Outcome::Overload); + 
assert_eq!(limiter.state().limit(), 1); + } + + #[tokio::test(start_paused = true)] + async fn should_decrease_limit_on_overload() { + let config = RateLimiterConfig { + initial_limit: 10, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(100)) + .await + .unwrap(); + token.release(Outcome::Overload); + + assert_eq!(limiter.state().limit(), 5, "overload: decrease"); + } + + #[tokio::test(start_paused = true)] + async fn acquire_timeout_times_out() { + let config = RateLimiterConfig { + initial_limit: 1, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 2, + inc: 10, + dec: 0.5, + utilisation: 0.8, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let now = tokio::time::Instant::now(); + limiter + .acquire_timeout(Duration::from_secs(1)) + .await + .err() + .unwrap(); + + assert!(now.elapsed() >= Duration::from_secs(1)); + + token.release(Outcome::Success); + + assert_eq!(limiter.state().limit(), 2); + } + + #[tokio::test(start_paused = true)] + async fn should_increase_limit_on_success_when_using_gt_util_threshold() { + let config = RateLimiterConfig { + initial_limit: 4, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 1, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let _token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + let _token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + + token.release(Outcome::Success); + assert_eq!(limiter.state().limit(), 5, "success: increase"); + } + + 
#[tokio::test(start_paused = true)] + async fn should_not_change_limit_on_success_when_using_lt_util_threshold() { + let config = RateLimiterConfig { + initial_limit: 4, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + + token.release(Outcome::Success); + assert_eq!( + limiter.state().limit(), + 4, + "success: ignore when < half limit" + ); + } + + #[tokio::test(start_paused = true)] + async fn should_not_change_limit_when_no_outcome() { + let config = RateLimiterConfig { + initial_limit: 10, + algorithm: RateLimitAlgorithm::Aimd { + conf: Aimd { + min: 1, + max: 1500, + inc: 10, + dec: 0.5, + utilisation: 0.5, + }, + }, + }; + + let limiter = DynamicLimiter::new(config); + + let token = limiter + .acquire_timeout(Duration::from_millis(1)) + .await + .unwrap(); + drop(token); + assert_eq!(limiter.state().limit(), 10, "ignore"); + } +} diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index a190b2cf8f..b8c9490696 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,9 +1,10 @@ use std::{ + borrow::Cow, collections::hash_map::RandomState, - hash::BuildHasher, + hash::{BuildHasher, Hash}, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, + Mutex, }, }; @@ -11,15 +12,48 @@ use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; use rand::{rngs::StdRng, Rng, SeedableRng}; -use smol_str::SmolStr; -use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; -use tokio::time::{timeout, Duration, Instant}; +use tokio::time::{Duration, Instant}; use tracing::info; -use super::{ - limit_algorithm::{LimitAlgorithm, Sample}, - RateLimiterConfig, -}; +use crate::intern::EndpointIdInt; + +pub struct GlobalRateLimiter { + data: Vec, + info: Vec, +} + +impl 
GlobalRateLimiter { + pub fn new(info: Vec) -> Self { + Self { + data: vec![ + RateBucket { + start: Instant::now(), + count: 0, + }; + info.len() + ], + info, + } + } + + /// Check that number of connections is below `max_rps` rps. + pub fn check(&mut self) -> bool { + let now = Instant::now(); + + let should_allow_request = self + .data + .iter_mut() + .zip(&self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now, 1)); + + if should_allow_request { + // only increment the bucket counts if the request will actually be accepted + self.data.iter_mut().for_each(|b| b.inc(1)); + } + + should_allow_request + } +} // Simple per-endpoint rate limiter. // @@ -27,14 +61,11 @@ use super::{ // Purposefully ignore user name and database name as clients can reconnect // with different names, so we'll end up sending some http requests to // the control plane. -// -// We also may save quite a lot of CPU (I think) by bailing out right after we -// saw SNI, before doing TLS handshake. User-side error messages in that case -// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now -// I went with a more expensive way that yields user-friendlier error messages. 
-pub struct EndpointRateLimiter { - map: DashMap, Hasher>, - info: &'static [RateBucketInfo], +pub type EndpointRateLimiter = BucketRateLimiter; + +pub struct BucketRateLimiter { + map: DashMap, Hasher>, + info: Cow<'static, [RateBucketInfo]>, access_count: AtomicUsize, rand: Mutex, } @@ -46,9 +77,9 @@ struct RateBucket { } impl RateBucket { - fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool { + fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant, n: u32) -> bool { if now - self.start < info.interval { - self.count < info.max_rpi + self.count + n <= info.max_rpi } else { // bucket expired, reset self.count = 0; @@ -58,8 +89,8 @@ impl RateBucket { } } - fn inc(&mut self) { - self.count += 1; + fn inc(&mut self, n: u32) { + self.count += n; } } @@ -72,7 +103,7 @@ pub struct RateBucketInfo { impl std::fmt::Display for RateBucketInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32; + let rps = (self.max_rpi as u64) * 1000 / self.interval.as_millis() as u64; write!(f, "{rps}@{}", humantime::format_duration(self.interval)) } } @@ -103,6 +134,12 @@ impl RateBucketInfo { Self::new(100, Duration::from_secs(600)), ]; + pub const DEFAULT_ENDPOINT_SET: [Self; 3] = [ + Self::new(500, Duration::from_secs(1)), + Self::new(300, Duration::from_secs(60)), + Self::new(200, Duration::from_secs(600)), + ]; + pub fn validate(info: &mut [Self]) -> anyhow::Result<()> { info.sort_unstable_by_key(|info| info.interval); let invalid = info @@ -111,7 +148,7 @@ impl RateBucketInfo { .find(|(a, b)| a.max_rpi > b.max_rpi); if let Some((a, b)) = invalid { bail!( - "invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})", + "invalid bucket RPS limits. 
{b} allows fewer requests per bucket than {a} ({} vs {})", b.max_rpi, a.max_rpi, ); @@ -123,19 +160,24 @@ impl RateBucketInfo { pub const fn new(max_rps: u32, interval: Duration) -> Self { Self { interval, - max_rpi: max_rps * interval.as_millis() as u32 / 1000, + max_rpi: ((max_rps as u64) * (interval.as_millis() as u64) / 1000) as u32, } } } -impl EndpointRateLimiter { - pub fn new(info: &'static [RateBucketInfo]) -> Self { +impl BucketRateLimiter { + pub fn new(info: impl Into>) -> Self { Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new()) } } -impl EndpointRateLimiter { - fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self { +impl BucketRateLimiter { + fn new_with_rand_and_hasher( + info: impl Into>, + rand: R, + hasher: S, + ) -> Self { + let info = info.into(); info!(buckets = ?info, "endpoint rate limiter"); Self { info, @@ -146,7 +188,7 @@ impl EndpointRateLimiter { } /// Check that number of connections to the endpoint is below `max_rps` rps. - pub fn check(&self, endpoint: SmolStr) -> bool { + pub fn check(&self, key: K, n: u32) -> bool { // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map. 
// worst case memory usage is about: // = 2 * 2048 * 64 * (48B + 72B) @@ -156,7 +198,7 @@ impl EndpointRateLimiter { } let now = Instant::now(); - let mut entry = self.map.entry(endpoint).or_insert_with(|| { + let mut entry = self.map.entry(key).or_insert_with(|| { vec![ RateBucket { start: now, @@ -168,12 +210,12 @@ impl EndpointRateLimiter { let should_allow_request = entry .iter_mut() - .zip(self.info) - .all(|(bucket, info)| bucket.should_allow_request(info, now)); + .zip(&*self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now, n)); if should_allow_request { // only increment the bucket counts if the request will actually be accepted - entry.iter_mut().for_each(RateBucket::inc); + entry.iter_mut().for_each(|b| b.inc(n)); } should_allow_request @@ -184,7 +226,7 @@ impl EndpointRateLimiter { /// But that way deletion does not aquire mutex on each entry access. pub fn do_gc(&self) { info!( - "cleaning up endpoint rate limiter, current size = {}", + "cleaning up bucket rate limiter, current size = {}", self.map.len() ); let n = self.map.shards().len(); @@ -195,417 +237,16 @@ impl EndpointRateLimiter { } } -/// Limits the number of concurrent jobs. -/// -/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the -/// token once the job is finished. -/// -/// The limit will be automatically adjusted based on observed latency (delay) and/or failures -/// caused by overload (loss). -pub struct Limiter { - limit_algo: AsyncMutex>, - semaphore: std::sync::Arc, - config: RateLimiterConfig, - - // ONLY WRITE WHEN LIMIT_ALGO IS LOCKED - limits: AtomicUsize, - - // ONLY USE ATOMIC ADD/SUB - in_flight: Arc, - - #[cfg(test)] - notifier: Option>, -} - -/// A concurrency token, required to run a job. -/// -/// Release the token back to the [Limiter] after the job is complete. 
-#[derive(Debug)] -pub struct Token<'t> { - permit: Option>, - start: Instant, - in_flight: Arc, -} - -/// A snapshot of the state of the [Limiter]. -/// -/// Not guaranteed to be consistent under high concurrency. -#[derive(Debug, Clone, Copy)] -pub struct LimiterState { - limit: usize, - in_flight: usize, -} - -/// Whether a job succeeded or failed as a result of congestion/overload. -/// -/// Errors not considered to be caused by overload should be ignored. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Outcome { - /// The job succeeded, or failed in a way unrelated to overload. - Success, - /// The job failed because of overload, e.g. it timed out or an explicit backpressure signal - /// was observed. - Overload, -} - -impl Outcome { - fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self { - match error { - reqwest_middleware::Error::Middleware(_) => Outcome::Success, - reqwest_middleware::Error::Reqwest(e) => { - if let Some(status) = e.status() { - if status.is_server_error() - || reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status - { - Outcome::Overload - } else { - Outcome::Success - } - } else { - Outcome::Success - } - } - } - } - fn from_reqwest_response(response: &reqwest::Response) -> Self { - if response.status().is_server_error() - || response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS - { - Outcome::Overload - } else { - Outcome::Success - } - } -} - -impl Limiter { - /// Create a limiter with a given limit control algorithm. 
- pub fn new(config: RateLimiterConfig) -> Self { - assert!(config.initial_limit > 0); - Self { - limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()), - semaphore: Arc::new(Semaphore::new(config.initial_limit)), - config, - limits: AtomicUsize::new(config.initial_limit), - in_flight: Arc::new(AtomicUsize::new(0)), - #[cfg(test)] - notifier: None, - } - } - // pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self { - // assert!(initial_limit > 0); - - // Self { - // limit_algo: AsyncMutex::new(limit_algorithm), - // semaphore: Arc::new(Semaphore::new(initial_limit)), - // timeout, - // limits: AtomicUsize::new(initial_limit), - // in_flight: Arc::new(AtomicUsize::new(0)), - // #[cfg(test)] - // notifier: None, - // } - // } - - /// In some cases [Token]s are acquired asynchronously when updating the limit. - #[cfg(test)] - pub fn with_release_notifier(mut self, n: std::sync::Arc) -> Self { - self.notifier = Some(n); - self - } - - /// Try to immediately acquire a concurrency [Token]. - /// - /// Returns `None` if there are none available. - pub fn try_acquire(&self) -> Option { - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. - Some(Token::new(None, self.in_flight.clone())) - } else { - self.semaphore - .try_acquire() - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok() - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Try to acquire a concurrency [Token], waiting for `duration` if there are none available. - /// - /// Returns `None` if there are none available after `duration`. - pub async fn acquire_timeout(&self, duration: Duration) -> Option> { - info!("acquiring token: {:?}", self.semaphore.available_permits()); - let result = if self.config.disable { - // If the rate limiter is disabled, we can always acquire a token. 
- Some(Token::new(None, self.in_flight.clone())) - } else { - match timeout(duration, self.semaphore.acquire()).await { - Ok(maybe_permit) => maybe_permit - .map(|permit| Token::new(Some(permit), self.in_flight.clone())) - .ok(), - Err(_) => None, - } - }; - if result.is_some() { - self.in_flight.fetch_add(1, Ordering::AcqRel); - } - result - } - - /// Return the concurrency [Token], along with the outcome of the job. - /// - /// The [Outcome] of the job, and the time taken to perform it, may be used - /// to update the concurrency limit. - /// - /// Set the outcome to `None` to ignore the job. - pub async fn release(&self, mut token: Token<'_>, outcome: Option) { - tracing::info!("outcome is {:?}", outcome); - let in_flight = self.in_flight.load(Ordering::Acquire); - let old_limit = self.limits.load(Ordering::Acquire); - let available = if self.config.disable { - 0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0. - } else { - self.semaphore.available_permits() - }; - let total = in_flight + available; - - let mut algo = self.limit_algo.lock().await; - - let new_limit = if let Some(outcome) = outcome { - let sample = Sample { - latency: token.start.elapsed(), - in_flight, - outcome, - }; - algo.update(old_limit, sample).await - } else { - old_limit - }; - tracing::info!("new limit is {}", new_limit); - let actual_limit = if new_limit < total { - token.forget(); - total.saturating_sub(1) - } else { - if !self.config.disable { - self.semaphore.add_permits(new_limit.saturating_sub(total)); - } - new_limit - }; - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["expected"]) - .set(new_limit as i64); - crate::metrics::RATE_LIMITER_LIMIT - .with_label_values(&["actual"]) - .set(actual_limit as i64); - self.limits.store(new_limit, Ordering::Release); - #[cfg(test)] - if let Some(n) = &self.notifier { - n.notify_one(); - } - } - - /// The current state of the limiter. 
- pub fn state(&self) -> LimiterState { - let limit = self.limits.load(Ordering::Relaxed); - let in_flight = self.in_flight.load(Ordering::Relaxed); - LimiterState { limit, in_flight } - } -} - -impl<'t> Token<'t> { - fn new(permit: Option>, in_flight: Arc) -> Self { - Self { - permit, - start: Instant::now(), - in_flight, - } - } - - pub fn forget(&mut self) { - if let Some(permit) = self.permit.take() { - permit.forget(); - } - } -} - -impl Drop for Token<'_> { - fn drop(&mut self) { - self.in_flight.fetch_sub(1, Ordering::AcqRel); - } -} - -impl LimiterState { - /// The current concurrency limit. - pub fn limit(&self) -> usize { - self.limit - } - /// The number of jobs in flight. - pub fn in_flight(&self) -> usize { - self.in_flight - } -} - -#[async_trait::async_trait] -impl reqwest_middleware::Middleware for Limiter { - async fn handle( - &self, - req: reqwest::Request, - extensions: &mut task_local_extensions::Extensions, - next: reqwest_middleware::Next<'_>, - ) -> reqwest_middleware::Result { - let start = Instant::now(); - let token = self - .acquire_timeout(self.config.timeout) - .await - .ok_or_else(|| { - reqwest_middleware::Error::Middleware( - // TODO: Should we map it into user facing errors? 
- crate::console::errors::ApiError::Console { - status: crate::http::StatusCode::TOO_MANY_REQUESTS, - text: "Too many requests".into(), - } - .into(), - ) - })?; - info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane"); - crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64()); - match next.run(req, extensions).await { - Ok(response) => { - self.release(token, Some(Outcome::from_reqwest_response(&response))) - .await; - Ok(response) - } - Err(e) => { - self.release(token, Some(Outcome::from_reqwest_error(&e))) - .await; - Err(e) - } - } - } -} - #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration}; + use std::{hash::BuildHasherDefault, time::Duration}; - use futures::{task::noop_waker_ref, Future}; use rand::SeedableRng; use rustc_hash::FxHasher; - use smol_str::SmolStr; use tokio::time; - use super::{EndpointRateLimiter, Limiter, Outcome}; - use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm}; - - #[tokio::test] - async fn it_works() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 10, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - let token = limiter.try_acquire().unwrap(); - - limiter.release(token, Some(Outcome::Success)).await; - - assert_eq!(limiter.state().limit(), 10); - } - - #[tokio::test] - async fn is_fair() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: false, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - - let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token2_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" 
- ); - - let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1))); - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token1" - ); - - limiter.release(token1, Some(Outcome::Success)).await; - // === END TOKEN 1 === - - // === TOKEN 2 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token2" - ); - - assert!( - token3_fut - .as_mut() - .poll(&mut Context::from_waker(noop_waker_ref())) - .is_pending(), - "token is acquired by token2" - ); - - let token2 = token2_fut.await.unwrap(); - - limiter.release(token2, Some(Outcome::Success)).await; - // === END TOKEN 2 === - - // === TOKEN 3 === - assert!( - limiter.try_acquire().is_none(), - "token is acquired by token3" - ); - - let token3 = token3_fut.await.unwrap(); - limiter.release(token3, Some(Outcome::Success)).await; - // === END TOKEN 3 === - - // === TOKEN 4 === - let token4 = limiter.try_acquire().unwrap(); - limiter.release(token4, Some(Outcome::Success)).await; - } - - #[tokio::test] - async fn disable() { - let config = super::RateLimiterConfig { - algorithm: RateLimitAlgorithm::Fixed, - timeout: Duration::from_secs(1), - initial_limit: 1, - disable: true, - ..Default::default() - }; - let limiter = Limiter::new(config); - - // === TOKEN 1 === - let token1 = limiter.try_acquire().unwrap(); - let token2 = limiter.try_acquire().unwrap(); - let state = limiter.state(); - assert_eq!(state.limit(), 1); - assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected. - limiter.release(token1, None).await; - limiter.release(token2, None).await; - } + use super::{BucketRateLimiter, EndpointRateLimiter}; + use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; #[test] fn rate_bucket_rpi() { @@ -636,7 +277,7 @@ mod tests { } #[test] - #[should_panic = "invalid endpoint RPS limits. 
10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"] + #[should_panic = "invalid bucket RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"] fn rate_buckets_validate() { let mut rates: Vec = ["300@1s", "10@10s"] .into_iter() @@ -652,42 +293,43 @@ mod tests { .map(|s| s.parse().unwrap()) .collect(); RateBucketInfo::validate(&mut rates).unwrap(); - let limiter = EndpointRateLimiter::new(Vec::leak(rates)); + let limiter = EndpointRateLimiter::new(rates); - let endpoint = SmolStr::from("ep-my-endpoint-1234"); + let endpoint = EndpointId::from("ep-my-endpoint-1234"); + let endpoint = EndpointIdInt::from(endpoint); time::pause(); for _ in 0..100 { - assert!(limiter.check(endpoint.clone())); + assert!(limiter.check(endpoint, 1)); } // more connections fail - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint, 1)); // fail even after 500ms as it's in the same bucket time::advance(time::Duration::from_millis(500)).await; - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint, 1)); // after a full 1s, 100 requests are allowed again time::advance(time::Duration::from_millis(500)).await; for _ in 1..6 { - for _ in 0..100 { - assert!(limiter.check(endpoint.clone())); + for _ in 0..50 { + assert!(limiter.check(endpoint, 2)); } time::advance(time::Duration::from_millis(1000)).await; } // more connections after 600 will exceed the 20rps@30s limit - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint, 1)); // will still fail before the 30 second limit time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await; - assert!(!limiter.check(endpoint.clone())); + assert!(!limiter.check(endpoint, 1)); // after the full 30 seconds, 100 requests are allowed again time::advance(time::Duration::from_millis(1)).await; for _ in 0..100 { - assert!(limiter.check(endpoint.clone())); + assert!(limiter.check(endpoint, 1)); } } @@ -697,13 +339,10 @@ mod tests { let rand = 
rand::rngs::StdRng::from_seed([1; 32]); let hasher = BuildHasherDefault::::default(); - let limiter = EndpointRateLimiter::new_with_rand_and_hasher( - &RateBucketInfo::DEFAULT_SET, - rand, - hasher, - ); + let limiter = + BucketRateLimiter::new_with_rand_and_hasher(&RateBucketInfo::DEFAULT_SET, rand, hasher); for i in 0..1_000_000 { - limiter.check(format!("{i}").into()); + limiter.check(i, 1); } assert!(limiter.map.len() < 150_000); } diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs new file mode 100644 index 0000000000..a322f0368c --- /dev/null +++ b/proxy/src/redis.rs @@ -0,0 +1,4 @@ +pub mod cancellation_publisher; +pub mod connection_with_credentials_provider; +pub mod elasticache; +pub mod notifications; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs new file mode 100644 index 0000000000..7baf104374 --- /dev/null +++ b/proxy/src/redis/cancellation_publisher.rs @@ -0,0 +1,161 @@ +use std::sync::Arc; + +use pq_proto::CancelKeyData; +use redis::AsyncCommands; +use tokio::sync::Mutex; +use uuid::Uuid; + +use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; + +use super::{ + connection_with_credentials_provider::ConnectionWithCredentialsProvider, + notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, +}; + +pub trait CancellationPublisherMut: Send + Sync + 'static { + #[allow(async_fn_in_trait)] + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()>; +} + +pub trait CancellationPublisher: Send + Sync + 'static { + #[allow(async_fn_in_trait)] + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()>; +} + +impl CancellationPublisher for () { + async fn try_publish( + &self, + _cancel_key_data: CancelKeyData, + _session_id: Uuid, + ) -> anyhow::Result<()> { + Ok(()) + } +} + +impl CancellationPublisherMut for P { + async fn try_publish( + &mut self, + cancel_key_data: 
CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { +

::try_publish(self, cancel_key_data, session_id).await + } +} + +impl CancellationPublisher for Option

{ + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if let Some(p) = self { + p.try_publish(cancel_key_data, session_id).await + } else { + Ok(()) + } + } +} + +impl CancellationPublisher for Arc> { + async fn try_publish( + &self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + self.lock() + .await + .try_publish(cancel_key_data, session_id) + .await + } +} + +pub struct RedisPublisherClient { + client: ConnectionWithCredentialsProvider, + region_id: String, + limiter: GlobalRateLimiter, +} + +impl RedisPublisherClient { + pub fn new( + client: ConnectionWithCredentialsProvider, + region_id: String, + info: &'static [RateBucketInfo], + ) -> anyhow::Result { + Ok(Self { + client, + region_id, + limiter: GlobalRateLimiter::new(info.into()), + }) + } + + async fn publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + let payload = serde_json::to_string(&Notification::Cancel(CancelSession { + region_id: Some(self.region_id.clone()), + cancel_key_data, + session_id, + }))?; + self.client.publish(PROXY_CHANNEL_NAME, payload).await?; + Ok(()) + } + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.connect().await { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e); + } + } + Ok(()) + } + async fn try_publish_internal( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping cancellation message"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + match self.publish(cancel_key_data, session_id).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + } + } + tracing::info!("Publisher is disconnected. 
Reconnectiong..."); + self.try_connect().await?; + self.publish(cancel_key_data, session_id).await + } +} + +impl CancellationPublisherMut for RedisPublisherClient { + async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + tracing::info!("publishing cancellation key to Redis"); + match self.try_publish_internal(cancel_key_data, session_id).await { + Ok(()) => { + tracing::info!("cancellation key successfuly published to Redis"); + Ok(()) + } + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + Err(e) + } + } + } +} diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs new file mode 100644 index 0000000000..3a90d911c2 --- /dev/null +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -0,0 +1,237 @@ +use std::{sync::Arc, time::Duration}; + +use futures::FutureExt; +use redis::{ + aio::{ConnectionLike, MultiplexedConnection}, + ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, +}; +use tokio::task::JoinHandle; +use tracing::{error, info}; + +use super::elasticache::CredentialsProvider; + +enum Credentials { + Static(ConnectionInfo), + Dynamic(Arc, redis::ConnectionAddr), +} + +impl Clone for Credentials { + fn clone(&self) -> Self { + match self { + Credentials::Static(info) => Credentials::Static(info.clone()), + Credentials::Dynamic(provider, addr) => { + Credentials::Dynamic(Arc::clone(provider), addr.clone()) + } + } + } +} + +/// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token. +/// Provides PubSub connection without credentials refresh. 
+pub struct ConnectionWithCredentialsProvider { + credentials: Credentials, + con: Option, + refresh_token_task: Option>, + mutex: tokio::sync::Mutex<()>, +} + +impl Clone for ConnectionWithCredentialsProvider { + fn clone(&self) -> Self { + Self { + credentials: self.credentials.clone(), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } +} + +impl ConnectionWithCredentialsProvider { + pub fn new_with_credentials_provider( + host: String, + port: u16, + credentials_provider: Arc, + ) -> Self { + Self { + credentials: Credentials::Dynamic( + credentials_provider, + redis::ConnectionAddr::TcpTls { + host, + port, + insecure: false, + tls_params: None, + }, + ), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + pub fn new_with_static_credentials(params: T) -> Self { + Self { + credentials: Credentials::Static(params.into_connection_info().unwrap()), + con: None, + refresh_token_task: None, + mutex: tokio::sync::Mutex::new(()), + } + } + + async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> { + redis::cmd("PING").query_async(con).await + } + + pub async fn connect(&mut self) -> anyhow::Result<()> { + let _guard = self.mutex.lock().await; + if let Some(con) = self.con.as_mut() { + match Self::ping(con).await { + Ok(()) => { + return Ok(()); + } + Err(e) => { + error!("Error during PING: {e:?}"); + } + } + } else { + info!("Connection is not established"); + } + info!("Establishing a new connection..."); + self.con = None; + if let Some(f) = self.refresh_token_task.take() { + f.abort() + } + let mut con = self + .get_client() + .await? 
+ .get_multiplexed_tokio_connection() + .await?; + if let Credentials::Dynamic(credentials_provider, _) = &self.credentials { + let credentials_provider = credentials_provider.clone(); + let con2 = con.clone(); + let f = tokio::spawn(async move { + let _ = Self::keep_connection(con2, credentials_provider).await; + }); + self.refresh_token_task = Some(f); + } + match Self::ping(&mut con).await { + Ok(()) => { + info!("Connection succesfully established"); + } + Err(e) => { + error!("Connection is broken. Error during PING: {e:?}"); + } + } + self.con = Some(con); + Ok(()) + } + + async fn get_connection_info(&self) -> anyhow::Result { + match &self.credentials { + Credentials::Static(info) => Ok(info.clone()), + Credentials::Dynamic(provider, addr) => { + let (username, password) = provider.provide_credentials().await?; + Ok(ConnectionInfo { + addr: addr.clone(), + redis: RedisConnectionInfo { + db: 0, + username: Some(username), + password: Some(password.clone()), + }, + }) + } + } + } + + async fn get_client(&self) -> anyhow::Result { + let client = redis::Client::open(self.get_connection_info().await?)?; + Ok(client) + } + + // PubSub does not support credentials refresh. + // Requires manual reconnection every 12h. + pub async fn get_async_pubsub(&self) -> anyhow::Result { + Ok(self.get_client().await?.get_async_pubsub().await?) + } + + // The connection lives for 12h. + // It can be prolonged with sending `AUTH` commands with the refreshed token. + // https://docs.aws.amazon.com/AmazonElastiCache/latest/red-ug/auth-iam.html#auth-iam-limits + async fn keep_connection( + mut con: MultiplexedConnection, + credentials_provider: Arc, + ) -> anyhow::Result<()> { + loop { + // The connection lives for 12h, for the sanity check we refresh it every hour. 
+ tokio::time::sleep(Duration::from_secs(60 * 60)).await; + match Self::refresh_token(&mut con, credentials_provider.clone()).await { + Ok(()) => { + info!("Token refreshed"); + } + Err(e) => { + error!("Error during token refresh: {e:?}"); + } + } + } + } + async fn refresh_token( + con: &mut MultiplexedConnection, + credentials_provider: Arc, + ) -> anyhow::Result<()> { + let (user, password) = credentials_provider.provide_credentials().await?; + redis::cmd("AUTH") + .arg(user) + .arg(password) + .query_async(con) + .await?; + Ok(()) + } + /// Sends an already encoded (packed) command into the TCP socket and + /// reads the single response from it. + pub async fn send_packed_command(&mut self, cmd: &redis::Cmd) -> RedisResult { + // Clone connection to avoid having to lock the ArcSwap in write mode + let con = self.con.as_mut().ok_or(redis::RedisError::from(( + redis::ErrorKind::IoError, + "Connection not established", + )))?; + con.send_packed_command(cmd).await + } + + /// Sends multiple already encoded (packed) command into the TCP socket + /// and reads `count` responses from it. This is used to implement + /// pipelining. 
+ pub async fn send_packed_commands( + &mut self, + cmd: &redis::Pipeline, + offset: usize, + count: usize, + ) -> RedisResult> { + // Clone shared connection future to avoid having to lock the ArcSwap in write mode + let con = self.con.as_mut().ok_or(redis::RedisError::from(( + redis::ErrorKind::IoError, + "Connection not established", + )))?; + con.send_packed_commands(cmd, offset, count).await + } +} + +impl ConnectionLike for ConnectionWithCredentialsProvider { + fn req_packed_command<'a>( + &'a mut self, + cmd: &'a redis::Cmd, + ) -> redis::RedisFuture<'a, redis::Value> { + (async move { self.send_packed_command(cmd).await }).boxed() + } + + fn req_packed_commands<'a>( + &'a mut self, + cmd: &'a redis::Pipeline, + offset: usize, + count: usize, + ) -> redis::RedisFuture<'a, Vec> { + (async move { self.send_packed_commands(cmd, offset, count).await }).boxed() + } + + fn get_db(&self) -> i64 { + 0 + } +} diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs new file mode 100644 index 0000000000..eded8250af --- /dev/null +++ b/proxy/src/redis/elasticache.rs @@ -0,0 +1,110 @@ +use std::time::{Duration, SystemTime}; + +use aws_config::meta::credentials::CredentialsProviderChain; +use aws_sdk_iam::config::ProvideCredentials; +use aws_sigv4::http_request::{ + self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, +}; +use tracing::info; + +#[derive(Debug)] +pub struct AWSIRSAConfig { + region: String, + service_name: String, + cluster_name: String, + user_id: String, + token_ttl: Duration, + action: String, +} + +impl AWSIRSAConfig { + pub fn new(region: String, cluster_name: Option, user_id: Option) -> Self { + AWSIRSAConfig { + region, + service_name: "elasticache".to_string(), + cluster_name: cluster_name.unwrap_or_default(), + user_id: user_id.unwrap_or_default(), + // "The IAM authentication token is valid for 15 minutes" + // https://docs.aws.amazon.com/memorydb/latest/devguide/auth-iam.html#auth-iam-limits + token_ttl: 
Duration::from_secs(15 * 60), + action: "connect".to_string(), + } + } +} + +/// Credentials provider for AWS elasticache authentication. +/// +/// Official documentation: +/// +/// +/// Useful resources: +/// +pub struct CredentialsProvider { + config: AWSIRSAConfig, + credentials_provider: CredentialsProviderChain, +} + +impl CredentialsProvider { + pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self { + CredentialsProvider { + config, + credentials_provider, + } + } + pub async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { + let aws_credentials = self + .credentials_provider + .provide_credentials() + .await? + .into(); + info!("AWS credentials successfully obtained"); + info!("Connecting to Redis with configuration: {:?}", self.config); + let mut settings = SigningSettings::default(); + settings.signature_location = SignatureLocation::QueryParams; + settings.expires_in = Some(self.config.token_ttl); + let signing_params = aws_sigv4::sign::v4::SigningParams::builder() + .identity(&aws_credentials) + .region(&self.config.region) + .name(&self.config.service_name) + .time(SystemTime::now()) + .settings(settings) + .build()? 
+ .into(); + let auth_params = [ + ("Action", &self.config.action), + ("User", &self.config.user_id), + ]; + let auth_params = url::form_urlencoded::Serializer::new(String::new()) + .extend_pairs(auth_params) + .finish(); + let auth_uri = http::Uri::builder() + .scheme("http") + .authority(self.config.cluster_name.as_bytes()) + .path_and_query(format!("/?{auth_params}")) + .build()?; + info!("{}", auth_uri); + + // Convert the HTTP request into a signable request + let signable_request = SignableRequest::new( + "GET", + auth_uri.to_string(), + std::iter::empty(), + SignableBody::Bytes(&[]), + )?; + + // Sign and then apply the signature to the request + let (si, _) = http_request::sign(signable_request, &signing_params)?.into_parts(); + let mut signable_request = http::Request::builder() + .method("GET") + .uri(auth_uri) + .body(())?; + si.apply_to_request_http1x(&mut signable_request); + Ok(( + self.config.user_id.clone(), + signable_request + .uri() + .to_string() + .replacen("http://", "", 1), + )) + } +} diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs new file mode 100644 index 0000000000..87d723d17e --- /dev/null +++ b/proxy/src/redis/notifications.rs @@ -0,0 +1,356 @@ +use std::{convert::Infallible, sync::Arc}; + +use futures::StreamExt; +use pq_proto::CancelKeyData; +use redis::aio::PubSub; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use uuid::Uuid; + +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::{ + cache::project_info::ProjectInfoCache, + cancellation::{CancelMap, CancellationHandler}, + intern::{ProjectIdInt, RoleNameInt}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, +}; + +const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; +const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); +const INVALIDATION_LAG: 
std::time::Duration = std::time::Duration::from_secs(20); + +async fn try_connect(client: &ConnectionWithCredentialsProvider) -> anyhow::Result { + let mut conn = client.get_async_pubsub().await?; + tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); + conn.subscribe(CPLANE_CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); + conn.subscribe(PROXY_CHANNEL_NAME).await?; + Ok(conn) +} + +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +#[serde(tag = "topic", content = "data")] +pub(crate) enum Notification { + #[serde( + rename = "/allowed_ips_updated", + deserialize_with = "deserialize_json_string" + )] + AllowedIpsUpdate { + allowed_ips_update: AllowedIpsUpdate, + }, + #[serde( + rename = "/password_updated", + deserialize_with = "deserialize_json_string" + )] + PasswordUpdate { password_update: PasswordUpdate }, + #[serde(rename = "/cancel_session")] + Cancel(CancelSession), +} +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedIpsUpdate { + project_id: ProjectIdInt, +} +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct PasswordUpdate { + project_id: ProjectIdInt, + role_name: RoleNameInt, +} +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct CancelSession { + pub region_id: Option, + pub cancel_key_data: CancelKeyData, + pub session_id: Uuid, +} + +fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result +where + T: for<'de2> serde::Deserialize<'de2>, + D: serde::Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + serde_json::from_str(&s).map_err(::custom) +} + +struct MessageHandler { + cache: Arc, + cancellation_handler: Arc>, + region_id: String, +} + +impl Clone for MessageHandler { + fn clone(&self) -> Self { + Self { + cache: self.cache.clone(), + cancellation_handler: self.cancellation_handler.clone(), + region_id: self.region_id.clone(), + } + } +} + +impl 
MessageHandler { + pub fn new( + cache: Arc, + cancellation_handler: Arc>, + region_id: String, + ) -> Self { + Self { + cache, + cancellation_handler, + region_id, + } + } + pub async fn increment_active_listeners(&self) { + self.cache.increment_active_listeners().await; + } + pub async fn decrement_active_listeners(&self) { + self.cache.decrement_active_listeners().await; + } + #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] + async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { + use Notification::*; + let payload: String = msg.get_payload()?; + tracing::debug!(?payload, "received a message payload"); + + let msg: Notification = match serde_json::from_str(&payload) { + Ok(msg) => msg, + Err(e) => { + Metrics::get().proxy.redis_errors_total.inc(RedisErrors { + channel: msg.get_channel_name(), + }); + tracing::error!("broken message: {e}"); + return Ok(()); + } + }; + tracing::debug!(?msg, "received a message"); + match msg { + Cancel(cancel_session) => { + tracing::Span::current().record( + "session_id", + &tracing::field::display(cancel_session.session_id), + ); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::CancelSession); + if let Some(cancel_region) = cancel_session.region_id { + // If the message is not for this region, ignore it. + if cancel_region != self.region_id { + return Ok(()); + } + } + // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. + match self + .cancellation_handler + .cancel_session(cancel_session.cancel_key_data, uuid::Uuid::nil()) + .await + { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to cancel session: {e}"); + } + } + } + _ => { + invalidate_cache(self.cache.clone(), msg.clone()); + if matches!(msg, AllowedIpsUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedIpsUpdate); + } else if matches!(msg, PasswordUpdate { .. 
}) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::PasswordUpdate); + } + // It might happen that the invalid entry is on the way to be cached. + // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. + // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. + let cache = self.cache.clone(); + tokio::spawn(async move { + tokio::time::sleep(INVALIDATION_LAG).await; + invalidate_cache(cache, msg); + }); + } + } + + Ok(()) + } +} + +fn invalidate_cache(cache: Arc, msg: Notification) { + use Notification::*; + match msg { + AllowedIpsUpdate { allowed_ips_update } => { + cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id) + } + PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project( + password_update.project_id, + password_update.role_name, + ), + Cancel(_) => unreachable!("cancel message should be handled separately"), + } +} + +async fn handle_messages( + handler: MessageHandler, + redis: ConnectionWithCredentialsProvider, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + loop { + if cancellation_token.is_cancelled() { + return Ok(()); + } + let mut conn = match try_connect(&redis).await { + Ok(conn) => { + handler.increment_active_listeners().await; + conn + } + Err(e) => { + tracing::error!( + "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}" + ); + tokio::time::sleep(RECONNECT_TIMEOUT).await; + continue; + } + }; + let mut stream = conn.on_message(); + while let Some(msg) = stream.next().await { + match handler.handle_message(msg).await { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to handle message: {e}, will try to reconnect"); + break; + } + } + if cancellation_token.is_cancelled() { + handler.decrement_active_listeners().await; + return Ok(()); + } + } + handler.decrement_active_listeners().await; + } +} + 
+/// Handle console's invalidation messages. +#[tracing::instrument(name = "redis_notifications", skip_all)] +pub async fn task_main( + redis: ConnectionWithCredentialsProvider, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result +where + C: ProjectInfoCache + Send + Sync + 'static, +{ + let cancellation_handler = Arc::new(CancellationHandler::<()>::new( + cancel_map, + crate::metrics::CancellationSource::FromRedis, + )); + let handler = MessageHandler::new(cache, cancellation_handler, region_id); + // 6h - 1m. + // There will be 1 minute overlap between two tasks. But at least we can be sure that no message is lost. + let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); + loop { + let cancellation_token = CancellationToken::new(); + interval.tick().await; + + tokio::spawn(handle_messages( + handler.clone(), + redis.clone(), + cancellation_token.clone(), + )); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_secs(6 * 60 * 60)).await; // 6h. 
+ cancellation_token.cancel(); + }); + } +} + +#[cfg(test)] +mod tests { + use crate::{ProjectId, RoleName}; + + use super::*; + use serde_json::json; + + #[test] + fn parse_allowed_ips() -> anyhow::Result<()> { + let project_id: ProjectId = "new_project".into(); + let data = format!("{{\"project_id\": \"{project_id}\"}}"); + let text = json!({ + "type": "message", + "topic": "/allowed_ips_updated", + "data": data, + "extre_fields": "something" + }) + .to_string(); + + let result: Notification = serde_json::from_str(&text)?; + assert_eq!( + result, + Notification::AllowedIpsUpdate { + allowed_ips_update: AllowedIpsUpdate { + project_id: (&project_id).into() + } + } + ); + + Ok(()) + } + + #[test] + fn parse_password_updated() -> anyhow::Result<()> { + let project_id: ProjectId = "new_project".into(); + let role_name: RoleName = "new_role".into(); + let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}"); + let text = json!({ + "type": "message", + "topic": "/password_updated", + "data": data, + "extre_fields": "something" + }) + .to_string(); + + let result: Notification = serde_json::from_str(&text)?; + assert_eq!( + result, + Notification::PasswordUpdate { + password_update: PasswordUpdate { + project_id: (&project_id).into(), + role_name: (&role_name).into(), + } + } + ); + + Ok(()) + } + #[test] + fn parse_cancel_session() -> anyhow::Result<()> { + let cancel_key_data = CancelKeyData { + backend_pid: 42, + cancel_key: 41, + }; + let uuid = uuid::Uuid::new_v4(); + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: None, + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result); + + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: Some("region".to_string()), + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = 
serde_json::from_str(&text)?; + assert_eq!(msg, result,); + + Ok(()) + } +} diff --git a/proxy/src/sasl.rs b/proxy/src/sasl.rs index da1cf21c6a..0811416ca2 100644 --- a/proxy/src/sasl.rs +++ b/proxy/src/sasl.rs @@ -10,7 +10,7 @@ mod channel_binding; mod messages; mod stream; -use crate::error::UserFacingError; +use crate::error::{ReportableError, UserFacingError}; use std::io; use thiserror::Error; @@ -33,6 +33,9 @@ pub enum Error { #[error("Internal error: missing digest")] MissingBinding, + #[error("could not decode salt: {0}")] + Base64(#[from] base64::DecodeError), + #[error(transparent)] Io(#[from] io::Error), } @@ -48,6 +51,19 @@ impl UserFacingError for Error { } } +impl ReportableError for Error { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + Error::ChannelBindingFailed(_) => crate::error::ErrorKind::User, + Error::ChannelBindingBadMethod(_) => crate::error::ErrorKind::User, + Error::BadClientMessage(_) => crate::error::ErrorKind::User, + Error::MissingBinding => crate::error::ErrorKind::Service, + Error::Base64(_) => crate::error::ErrorKind::ControlPlane, + Error::Io(_) => crate::error::ErrorKind::ClientDisconnect, + } + } +} + /// A convenient result type for SASL exchange. pub type Result = std::result::Result; diff --git a/proxy/src/scram.rs b/proxy/src/scram.rs index 49a7a13043..862facb4e5 100644 --- a/proxy/src/scram.rs +++ b/proxy/src/scram.rs @@ -6,14 +6,14 @@ //! * //! 
* +mod countmin; mod exchange; mod key; mod messages; +mod pbkdf2; mod secret; mod signature; - -#[cfg(any(test, doc))] -mod password; +pub mod threadpool; pub use exchange::{exchange, Exchange}; pub use key::ScramKey; @@ -59,27 +59,23 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::sasl::{Mechanism, Step}; + use crate::{ + intern::EndpointIdInt, + sasl::{Mechanism, Step}, + EndpointId, + }; - use super::{password::SaltedPassword, Exchange, ServerSecret}; + use super::{threadpool::ThreadPool, Exchange, ServerSecret}; #[test] - fn happy_path() { + fn snapshot() { let iterations = 4096; - let salt_base64 = "QSXCR+Q6sek8bf92"; - let pw = SaltedPassword::new( - b"pencil", - base64::decode(salt_base64).unwrap().as_slice(), - iterations, - ); + let salt = "QSXCR+Q6sek8bf92"; + let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8="; + let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo="; + let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",); + let secret = ServerSecret::parse(&secret).unwrap(); - let secret = ServerSecret { - iterations, - salt_base64: salt_base64.to_owned(), - stored_key: pw.client_key().sha256(), - server_key: pw.server_key(), - doomed: false, - }; const NONCE: [u8; 18] = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ]; @@ -121,4 +117,32 @@ mod tests { ] ); } + + async fn run_round_trip_test(server_password: &str, client_password: &str) { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + + let scram_secret = ServerSecret::build(server_password).await.unwrap(); + let outcome = super::exchange(&pool, ep, &scram_secret, client_password.as_bytes()) + .await + .unwrap(); + + match outcome { + crate::sasl::Outcome::Success(_) => {} + crate::sasl::Outcome::Failure(r) => panic!("{r}"), + } + } + + #[tokio::test] + async fn round_trip() { + run_round_trip_test("pencil", "pencil").await + 
} + + #[tokio::test] + #[should_panic(expected = "password doesn't match")] + async fn failure() { + run_round_trip_test("pencil", "eraser").await + } } diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs new file mode 100644 index 0000000000..f2b794e5fe --- /dev/null +++ b/proxy/src/scram/countmin.rs @@ -0,0 +1,173 @@ +use std::hash::Hash; + +/// estimator of hash jobs per second. +/// +pub struct CountMinSketch { + // one for each depth + hashers: Vec, + width: usize, + depth: usize, + // buckets, width*depth + buckets: Vec, +} + +impl CountMinSketch { + /// Given parameters (ε, δ), + /// set width = ceil(e/ε) + /// set depth = ceil(ln(1/δ)) + /// + /// guarantees: + /// actual <= estimate + /// estimate <= actual + ε * N with probability 1 - δ + /// where N is the cardinality of the stream + pub fn with_params(epsilon: f64, delta: f64) -> Self { + CountMinSketch::new( + (std::f64::consts::E / epsilon).ceil() as usize, + (1.0_f64 / delta).ln().ceil() as usize, + ) + } + + fn new(width: usize, depth: usize) -> Self { + Self { + #[cfg(test)] + hashers: (0..depth) + .map(|i| { + // digits of pi for good randomness + ahash::RandomState::with_seeds( + 314159265358979323, + 84626433832795028, + 84197169399375105, + 82097494459230781 + i as u64, + ) + }) + .collect(), + #[cfg(not(test))] + hashers: (0..depth).map(|_| ahash::RandomState::new()).collect(), + width, + depth, + buckets: vec![0; width * depth], + } + } + + pub fn inc_and_return(&mut self, t: &T, x: u32) -> u32 { + let mut min = u32::MAX; + for row in 0..self.depth { + let col = (self.hashers[row].hash_one(t) as usize) % self.width; + + let row = &mut self.buckets[row * self.width..][..self.width]; + row[col] = row[col].saturating_add(x); + min = std::cmp::min(min, row[col]); + } + min + } + + pub fn reset(&mut self) { + self.buckets.clear(); + self.buckets.resize(self.width * self.depth, 0); + } +} + +#[cfg(test)] +mod tests { + use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; 
+ + use super::CountMinSketch; + + fn eval_precision(n: usize, p: f64, q: f64) -> usize { + // fixed value of phi for consistent test + let mut rng = StdRng::seed_from_u64(16180339887498948482); + + #[allow(non_snake_case)] + let mut N = 0; + + let mut ids = vec![]; + + for _ in 0..n { + // number of insert operations + let n = rng.gen_range(1..100); + // number to insert at once + let m = rng.gen_range(1..4096); + + let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); + ids.push((id, n, m)); + + // N = sum(actual) + N += n * m; + } + + // q% of counts will be within p of the actual value + let mut sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); + + dbg!(sketch.buckets.len()); + + // insert a bunch of entries in a random order + let mut ids2 = ids.clone(); + while !ids2.is_empty() { + ids2.shuffle(&mut rng); + + let mut i = 0; + while i < ids2.len() { + sketch.inc_and_return(&ids2[i].0, ids2[i].1); + ids2[i].2 -= 1; + if ids2[i].2 == 0 { + ids2.remove(i); + } else { + i += 1; + } + } + } + + let mut within_p = 0; + for (id, n, m) in ids { + let actual = n * m; + let estimate = sketch.inc_and_return(&id, 0); + + // This estimate has the guarantee that actual <= estimate + assert!(actual <= estimate); + + // This estimate has the guarantee that estimate <= actual + εN with probability 1 - δ. + // ε = p / N, δ = 1 - q; + // therefore, estimate <= actual + p with probability q. + if estimate as f64 <= actual as f64 + p { + within_p += 1; + } + } + within_p + } + + #[test] + fn precision() { + assert_eq!(eval_precision(100, 100.0, 0.99), 100); + assert_eq!(eval_precision(1000, 100.0, 0.99), 1000); + assert_eq!(eval_precision(100, 4096.0, 0.99), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.99), 1000); + + // seems to be more precise than the literature indicates? + // probably numbers are too small to truly represent the probabilities. 
+ assert_eq!(eval_precision(100, 4096.0, 0.90), 100); + assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000); + assert_eq!(eval_precision(100, 4096.0, 0.1), 98); + assert_eq!(eval_precision(1000, 4096.0, 0.1), 991); + } + + // returns memory usage in bytes, and the time complexity per insert. + fn eval_cost(p: f64, q: f64) -> (usize, usize) { + #[allow(non_snake_case)] + // N = sum(actual) + // Let's assume 1021 samples, all of 4096 + let N = 1021 * 4096; + let sketch = CountMinSketch::with_params(p / N as f64, 1.0 - q); + + let memory = std::mem::size_of::() * sketch.buckets.len(); + let time = sketch.depth; + (memory, time) + } + + #[test] + fn memory_usage() { + assert_eq!(eval_cost(100.0, 0.99), (2273580, 5)); + assert_eq!(eval_cost(4096.0, 0.99), (55520, 5)); + assert_eq!(eval_cost(4096.0, 0.90), (33312, 3)); + assert_eq!(eval_cost(4096.0, 0.1), (11104, 1)); + } +} diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 9af7db5201..d0adbc780e 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -2,14 +2,19 @@ use std::convert::Infallible; -use postgres_protocol::authentication::sasl::ScramSha256; +use hmac::{Hmac, Mac}; +use sha2::Sha256; use super::messages::{ ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN, }; +use super::pbkdf2::Pbkdf2; use super::secret::ServerSecret; use super::signature::SignatureBuilder; +use super::threadpool::ThreadPool; +use super::ScramKey; use crate::config; +use crate::intern::EndpointIdInt; use crate::sasl::{self, ChannelBinding, Error as SaslError}; /// The only channel binding mode we currently support. 
@@ -71,40 +76,45 @@ impl<'a> Exchange<'a> { } } -pub fn exchange( +// copied from +async fn derive_client_key( + pool: &ThreadPool, + endpoint: EndpointIdInt, + password: &[u8], + salt: &[u8], + iterations: u32, +) -> ScramKey { + let salted_password = pool + .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations)) + .await + .expect("job should not be cancelled"); + + let make_key = |name| { + let key = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes") + .chain_update(name) + .finalize(); + + <[u8; 32]>::from(key.into_bytes()) + }; + + make_key(b"Client Key").into() +} + +pub async fn exchange( + pool: &ThreadPool, + endpoint: EndpointIdInt, secret: &ServerSecret, - mut client: ScramSha256, - tls_server_end_point: config::TlsServerEndPoint, + password: &[u8], ) -> sasl::Result> { - use sasl::Step::*; + let salt = base64::decode(&secret.salt_base64)?; + let client_key = derive_client_key(pool, endpoint, password, &salt, secret.iterations).await; - let init = SaslInitial { - nonce: rand::random, - }; - - let client_first = std::str::from_utf8(client.message()) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - let sent = match init.transition(secret, &tls_server_end_point, client_first)? { - Continue(sent, server_first) => { - client.update(server_first.as_bytes())?; - sent - } - Success(x, _) => match x {}, - Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), - }; - - let client_final = std::str::from_utf8(client.message()) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; - let keys = match sent.transition(secret, &tls_server_end_point, client_final)? 
{ - Success(keys, server_final) => { - client.finish(server_final.as_bytes())?; - keys - } - Continue(x, _) => match x {}, - Failure(msg) => return Ok(sasl::Outcome::Failure(msg)), - }; - - Ok(sasl::Outcome::Success(keys)) + if secret.is_password_invalid(&client_key).into() { + Ok(sasl::Outcome::Failure("password doesn't match")) + } else { + Ok(sasl::Outcome::Success(client_key)) + } } impl SaslInitial { @@ -185,7 +195,7 @@ impl SaslSentInner { .derive_client_key(&client_final_message.proof); // Auth fails either if keys don't match or it's pre-determined to fail. - if client_key.sha256() != secret.stored_key || secret.doomed { + if secret.is_password_invalid(&client_key).into() { return Ok(sasl::Step::Failure("password doesn't match")); } diff --git a/proxy/src/scram/key.rs b/proxy/src/scram/key.rs index bd93fb2b70..32a3dbd203 100644 --- a/proxy/src/scram/key.rs +++ b/proxy/src/scram/key.rs @@ -1,17 +1,31 @@ //! Tools for client/server/stored key management. +use subtle::ConstantTimeEq; + /// Faithfully taken from PostgreSQL. pub const SCRAM_KEY_LEN: usize = 32; -/// One of the keys derived from the [password](super::password::SaltedPassword). +/// One of the keys derived from the user's password. /// We use the same structure for all keys, i.e. /// `ClientKey`, `StoredKey`, and `ServerKey`. 
-#[derive(Clone, Default, PartialEq, Eq)] +#[derive(Clone, Default, Eq, Debug)] #[repr(transparent)] pub struct ScramKey { bytes: [u8; SCRAM_KEY_LEN], } +impl PartialEq for ScramKey { + fn eq(&self, other: &Self) -> bool { + self.ct_eq(other).into() + } +} + +impl ConstantTimeEq for ScramKey { + fn ct_eq(&self, other: &Self) -> subtle::Choice { + self.bytes.ct_eq(&other.bytes) + } +} + impl ScramKey { pub fn sha256(&self) -> Self { super::sha256([self.as_ref()]).into() diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index b59baec508..cf677a3334 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -32,8 +32,6 @@ pub struct ClientFirstMessage<'a> { pub bare: &'a str, /// Channel binding mode. pub cbind_flag: ChannelBinding<&'a str>, - /// (Client username)[]. - pub username: &'a str, /// Client nonce. pub nonce: &'a str, } @@ -58,6 +56,14 @@ impl<'a> ClientFirstMessage<'a> { // In theory, these might be preceded by "reserved-mext" (i.e. "m=") let username = parts.next()?.strip_prefix("n=")?; + + // https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14 + if !username.is_empty() { + tracing::warn!(username, "scram username provided, but is not expected") + // TODO(conrad): + // return None; + } + let nonce = parts.next()?.strip_prefix("r=")?; // Validate but ignore auth extensions @@ -66,7 +72,6 @@ impl<'a> ClientFirstMessage<'a> { Some(Self { bare, cbind_flag, - username, nonce, }) } @@ -188,24 +193,44 @@ mod tests { // (Almost) real strings captured during debug sessions let cases = [ - (NotSupportedClient, "n,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), - (NotSupportedServer, "y,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"), + (NotSupportedClient, "n,,n=,r=t8JwklwKecDLwSsA72rHmVju"), + (NotSupportedServer, "y,,n=,r=t8JwklwKecDLwSsA72rHmVju"), ( Required("tls-server-end-point"), - "p=tls-server-end-point,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju", + 
"p=tls-server-end-point,,n=,r=t8JwklwKecDLwSsA72rHmVju", ), ]; for (cb, input) in cases { let msg = ClientFirstMessage::parse(input).unwrap(); - assert_eq!(msg.bare, "n=pepe,r=t8JwklwKecDLwSsA72rHmVju"); - assert_eq!(msg.username, "pepe"); + assert_eq!(msg.bare, "n=,r=t8JwklwKecDLwSsA72rHmVju"); assert_eq!(msg.nonce, "t8JwklwKecDLwSsA72rHmVju"); assert_eq!(msg.cbind_flag, cb); } } + #[test] + fn parse_client_first_message_with_invalid_gs2_authz() { + assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none()) + } + + #[test] + fn parse_client_first_message_with_extra_params() { + let msg = ClientFirstMessage::parse("n,,n=,r=nonce,a=foo,b=bar,c=baz").unwrap(); + assert_eq!(msg.bare, "n=,r=nonce,a=foo,b=bar,c=baz"); + assert_eq!(msg.nonce, "nonce"); + assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient); + } + + #[test] + fn parse_client_first_message_with_extra_params_invalid() { + // must be of the form `=<...>` + assert!(ClientFirstMessage::parse("n,,n=,r=nonce,abc=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=,r=nonce,1=foo").is_none()); + assert!(ClientFirstMessage::parse("n,,n=,r=nonce,a").is_none()); + } + #[test] fn parse_client_final_message() { let input = [ diff --git a/proxy/src/scram/password.rs b/proxy/src/scram/password.rs deleted file mode 100644 index 022f2842dd..0000000000 --- a/proxy/src/scram/password.rs +++ /dev/null @@ -1,74 +0,0 @@ -//! Password hashing routines. - -use super::key::ScramKey; - -pub const SALTED_PASSWORD_LEN: usize = 32; - -/// Salted hashed password is essential for [key](super::key) derivation. -#[repr(transparent)] -pub struct SaltedPassword { - bytes: [u8; SALTED_PASSWORD_LEN], -} - -impl SaltedPassword { - /// See `scram-common.c : scram_SaltedPassword` for details. - /// Further reading: (see `PBKDF2`). 
- pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - pbkdf2::pbkdf2_hmac_array::(password, salt, iterations).into() - } - - /// Derive `ClientKey` from a salted hashed password. - pub fn client_key(&self) -> ScramKey { - super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into() - } - - /// Derive `ServerKey` from a salted hashed password. - pub fn server_key(&self) -> ScramKey { - super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into() - } -} - -impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword { - #[inline(always)] - fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self { - Self { bytes } - } -} - -#[cfg(test)] -mod tests { - use super::SaltedPassword; - - fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword { - let one = 1_u32.to_be_bytes(); // magic - - let mut current = super::super::hmac_sha256(password, [salt, &one]); - let mut result = current; - for _ in 1..iterations { - current = super::super::hmac_sha256(password, [current.as_ref()]); - // TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094 - for (i, x) in current.iter().enumerate() { - result[i] ^= x; - } - } - - result.into() - } - - #[test] - fn pbkdf2() { - let password = "a-very-secure-password"; - let salt = "such-a-random-salt"; - let iterations = 4096; - let output = [ - 203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211, - 101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42, - ]; - - let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations); - let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations); - - assert_eq!(actual.bytes, output); - assert_eq!(actual.bytes, expected.bytes); - } -} diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs new file mode 100644 index 0000000000..a803ba7e1b --- /dev/null +++ b/proxy/src/scram/pbkdf2.rs @@ -0,0 +1,89 @@ +use hmac::{ + digest::{consts::U32, 
generic_array::GenericArray}, + Hmac, Mac, +}; +use sha2::Sha256; + +pub struct Pbkdf2 { + hmac: Hmac, + prev: GenericArray, + hi: GenericArray, + iterations: u32, +} + +// inspired from +impl Pbkdf2 { + pub fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self { + let hmac = + Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + + let prev = hmac + .clone() + .chain_update(salt) + .chain_update(1u32.to_be_bytes()) + .finalize() + .into_bytes(); + + Self { + hmac, + // one consumed for the hash above + iterations: iterations - 1, + hi: prev, + prev, + } + } + + pub fn cost(&self) -> u32 { + (self.iterations).clamp(0, 4096) + } + + pub fn turn(&mut self) -> std::task::Poll<[u8; 32]> { + let Self { + hmac, + prev, + hi, + iterations, + } = self; + + // only do 4096 iterations per turn before sharing the thread for fairness + let n = (*iterations).clamp(0, 4096); + for _ in 0..n { + *prev = hmac.clone().chain_update(*prev).finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(*prev) { + *hi ^= prev; + } + } + + *iterations -= n; + if *iterations == 0 { + std::task::Poll::Ready((*hi).into()) + } else { + std::task::Poll::Pending + } + } +} + +#[cfg(test)] +mod tests { + use super::Pbkdf2; + use pbkdf2::pbkdf2_hmac_array; + use sha2::Sha256; + + #[test] + fn works() { + let salt = b"sodium chloride"; + let pass = b"Ne0n_!5_50_C007"; + + let mut job = Pbkdf2::start(pass, salt, 600000); + let hash = loop { + let std::task::Poll::Ready(hash) = job.turn() else { + continue; + }; + break hash; + }; + + let expected = pbkdf2_hmac_array::(pass, salt, 600000); + assert_eq!(hash, expected) + } +} diff --git a/proxy/src/scram/secret.rs b/proxy/src/scram/secret.rs index 9e74e07af1..44c4f9e44a 100644 --- a/proxy/src/scram/secret.rs +++ b/proxy/src/scram/secret.rs @@ -1,11 +1,13 @@ //! Tools for SCRAM server secret management. 
+use subtle::{Choice, ConstantTimeEq}; + use super::base64_decode_array; use super::key::ScramKey; -/// Server secret is produced from [password](super::password::SaltedPassword) +/// Server secret is produced from user's password, /// and is used throughout the authentication process. -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq, Debug)] pub struct ServerSecret { /// Number of iterations for `PBKDF2` function. pub iterations: u32, @@ -40,16 +42,21 @@ impl ServerSecret { Some(secret) } + pub fn is_password_invalid(&self, client_key: &ScramKey) -> Choice { + // constant time to not leak partial key match + client_key.sha256().ct_ne(&self.stored_key) | Choice::from(self.doomed as u8) + } + /// To avoid revealing information to an attacker, we use a /// mocked server secret even if the user doesn't exist. /// See `auth-scram.c : mock_scram_secret` for details. - pub fn mock(user: &str, nonce: [u8; 32]) -> Self { - // Refer to `auth-scram.c : scram_mock_salt`. - let mocked_salt = super::sha256([user.as_bytes(), &nonce]); - + pub fn mock(nonce: [u8; 32]) -> Self { Self { - iterations: 4096, - salt_base64: base64::encode(mocked_salt), + // this doesn't reveal much information as we're going to use + // iteration count 1 for our generated passwords going forward. + // PG16 users can set iteration count=1 already today. + iterations: 1, + salt_base64: base64::encode(nonce), stored_key: ScramKey::default(), server_key: ScramKey::default(), doomed: true, @@ -59,21 +66,8 @@ impl ServerSecret { /// Build a new server secret from the prerequisites. /// XXX: We only use this function in tests. 
#[cfg(test)] - pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option { - // TODO: implement proper password normalization required by the RFC - if !password.is_ascii() { - return None; - } - - let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations); - - Some(Self { - iterations, - salt_base64: base64::encode(salt), - stored_key: password.client_key().sha256(), - server_key: password.server_key(), - doomed: false, - }) + pub async fn build(password: &str) -> Option { + Self::parse(&postgres_protocol::password::scram_sha_256(password.as_bytes()).await) } } @@ -103,20 +97,4 @@ mod tests { assert_eq!(base64::encode(parsed.stored_key), stored_key); assert_eq!(base64::encode(parsed.server_key), server_key); } - - #[test] - fn build_scram_secret() { - let salt = b"salt"; - let secret = ServerSecret::build("password", salt, 4096).unwrap(); - assert_eq!(secret.iterations, 4096); - assert_eq!(secret.salt_base64, base64::encode(salt)); - assert_eq!( - base64::encode(secret.stored_key.as_ref()), - "lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ=" - ); - assert_eq!( - base64::encode(secret.server_key.as_ref()), - "ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw=" - ); - } } diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs new file mode 100644 index 0000000000..7701b869a3 --- /dev/null +++ b/proxy/src/scram/threadpool.rs @@ -0,0 +1,321 @@ +//! Custom threadpool implementation for password hashing. +//! +//! Requirements: +//! 1. Fairness per endpoint. +//! 2. Yield support for high iteration counts. 
+ +use std::sync::{ + atomic::{AtomicU64, Ordering}, + Arc, +}; + +use crossbeam_deque::{Injector, Stealer, Worker}; +use itertools::Itertools; +use parking_lot::{Condvar, Mutex}; +use rand::Rng; +use rand::{rngs::SmallRng, SeedableRng}; +use tokio::sync::oneshot; + +use crate::{ + intern::EndpointIdInt, + metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, + scram::countmin::CountMinSketch, +}; + +use super::pbkdf2::Pbkdf2; + +pub struct ThreadPool { + queue: Injector, + stealers: Vec>, + parkers: Vec<(Condvar, Mutex)>, + /// bitpacked representation. + /// lower 8 bits = number of sleeping threads + /// next 8 bits = number of idle threads (searching for work) + counters: AtomicU64, + + pub metrics: Arc, +} + +#[derive(PartialEq)] +enum ThreadState { + Parked, + Active, +} + +impl ThreadPool { + pub fn new(n_workers: u8) -> Arc { + let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec(); + let stealers = workers.iter().map(|w| w.stealer()).collect_vec(); + + let parkers = (0..n_workers) + .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active))) + .collect_vec(); + + let pool = Arc::new(Self { + queue: Injector::new(), + stealers, + parkers, + // threads start searching for work + counters: AtomicU64::new((n_workers as u64) << 8), + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + }); + + for (i, worker) in workers.into_iter().enumerate() { + let pool = Arc::clone(&pool); + std::thread::spawn(move || thread_rt(pool, worker, i)); + } + + pool + } + + pub fn spawn_job( + &self, + endpoint: EndpointIdInt, + pbkdf2: Pbkdf2, + ) -> oneshot::Receiver<[u8; 32]> { + let (tx, rx) = oneshot::channel(); + + let queue_was_empty = self.queue.is_empty(); + + self.metrics.injector_queue_depth.inc(); + self.queue.push(JobSpec { + response: tx, + pbkdf2, + endpoint, + }); + + // inspired from + let counts = self.counters.load(Ordering::SeqCst); + let num_awake_but_idle = (counts >> 8) & 0xff; + let num_sleepers = counts & 0xff; + + // If the queue 
is non-empty, then we always wake up a worker + // -- clearly the existing idle jobs aren't enough. Otherwise, + // check to see if we have enough idle workers. + if !queue_was_empty || num_awake_but_idle == 0 { + let num_to_wake = Ord::min(1, num_sleepers); + self.wake_any_threads(num_to_wake); + } + + rx + } + + #[cold] + fn wake_any_threads(&self, mut num_to_wake: u64) { + if num_to_wake > 0 { + for i in 0..self.parkers.len() { + if self.wake_specific_thread(i) { + num_to_wake -= 1; + if num_to_wake == 0 { + return; + } + } + } + } + } + + fn wake_specific_thread(&self, index: usize) -> bool { + let (condvar, lock) = &self.parkers[index]; + + let mut state = lock.lock(); + if *state == ThreadState::Parked { + condvar.notify_one(); + + // When the thread went to sleep, it will have incremented + // this value. When we wake it, its our job to decrement + // it. We could have the thread do it, but that would + // introduce a delay between when the thread was + // *notified* and when this counter was decremented. That + // might mislead people with new work into thinking that + // there are sleeping threads that they should try to + // wake, when in fact there is nothing left for them to + // do. 
+ self.counters.fetch_sub(1, Ordering::SeqCst); + *state = ThreadState::Active; + + true + } else { + false + } + } + + fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker) -> Option { + // announce thread as idle + self.counters.fetch_add(256, Ordering::SeqCst); + + // try steal from the global queue + loop { + match self.queue.steal_batch_and_pop(worker) { + crossbeam_deque::Steal::Success(job) => { + self.metrics + .injector_queue_depth + .set(self.queue.len() as i64); + // no longer idle + self.counters.fetch_sub(256, Ordering::SeqCst); + return Some(job); + } + crossbeam_deque::Steal::Retry => continue, + crossbeam_deque::Steal::Empty => break, + } + } + + // try steal from our neighbours + loop { + let mut retry = false; + let start = rng.gen_range(0..self.stealers.len()); + let job = (start..self.stealers.len()) + .chain(0..start) + .filter(|i| *i != skip) + .find_map( + |victim| match self.stealers[victim].steal_batch_and_pop(worker) { + crossbeam_deque::Steal::Success(job) => Some(job), + crossbeam_deque::Steal::Empty => None, + crossbeam_deque::Steal::Retry => { + retry = true; + None + } + }, + ); + if job.is_some() { + // no longer idle + self.counters.fetch_sub(256, Ordering::SeqCst); + return job; + } + if !retry { + return None; + } + } + } +} + +fn thread_rt(pool: Arc, worker: Worker, index: usize) { + /// interval when we should steal from the global queue + /// so that tail latencies are managed appropriately + const STEAL_INTERVAL: usize = 61; + + /// How often to reset the sketch values + const SKETCH_RESET_INTERVAL: usize = 1021; + + let mut rng = SmallRng::from_entropy(); + + // used to determine whether we should temporarily skip tasks for fairness. 
+ // 99% of estimates will overcount by no more than 4096 samples + let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01); + + let (condvar, lock) = &pool.parkers[index]; + + 'wait: loop { + // wait for notification of work + { + let mut lock = lock.lock(); + + // queue is empty + pool.metrics + .worker_queue_depth + .set(ThreadPoolWorkerId(index), 0); + + // subtract 1 from idle count, add 1 to sleeping count. + pool.counters.fetch_sub(255, Ordering::SeqCst); + + *lock = ThreadState::Parked; + condvar.wait(&mut lock); + } + + for i in 0.. { + let mut job = match worker + .pop() + .or_else(|| pool.steal(&mut rng, index, &worker)) + { + Some(job) => job, + None => continue 'wait, + }; + + pool.metrics + .worker_queue_depth + .set(ThreadPoolWorkerId(index), worker.len() as i64); + + // receiver is closed, cancel the task + if !job.response.is_closed() { + let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost()); + + const P: f64 = 2000.0; + // probability decreases as rate increases. + // lower probability, higher chance of being skipped + // + // estimates (rate in terms of 4096 rounds): + // rate = 0 => probability = 100% + // rate = 10 => probability = 71.3% + // rate = 50 => probability = 62.1% + // rate = 500 => probability = 52.3% + // rate = 1021 => probability = 49.8% + // + // My expectation is that the pool queue will only begin backing up at ~1000rps + // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above + // are in requests per second. 
+ let probability = P.ln() / (P + rate as f64).ln(); + if pool.queue.len() > 32 || rng.gen_bool(probability) { + pool.metrics + .worker_task_turns_total + .inc(ThreadPoolWorkerId(index)); + + match job.pbkdf2.turn() { + std::task::Poll::Ready(result) => { + let _ = job.response.send(result); + } + std::task::Poll::Pending => worker.push(job), + } + } else { + pool.metrics + .worker_task_skips_total + .inc(ThreadPoolWorkerId(index)); + + // skip for now + worker.push(job) + } + } + + // if we get stuck with a few long lived jobs in the queue + // it's better to try and steal from the queue too for fairness + if i % STEAL_INTERVAL == 0 { + let _ = pool.queue.steal_batch(&worker); + } + + if i % SKETCH_RESET_INTERVAL == 0 { + sketch.reset(); + } + } + } +} + +struct JobSpec { + response: oneshot::Sender<[u8; 32]>, + pbkdf2: Pbkdf2, + endpoint: EndpointIdInt, +} + +#[cfg(test)] +mod tests { + use crate::EndpointId; + + use super::*; + + #[tokio::test] + async fn hash_is_correct() { + let pool = ThreadPool::new(1); + + let ep = EndpointId::from("foo"); + let ep = EndpointIdInt::from(ep); + + let salt = [0x55; 32]; + let actual = pool + .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096)) + .await + .unwrap(); + + let expected = [ + 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, + 178, 43, 95, 8, 225, 182, 122, 40, 219, 21, 89, 147, 64, 140, + ]; + assert_eq!(actual, expected) + } +} diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 83a9773052..efa999ed7d 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -2,60 +2,72 @@ //! //! Handles both SQL over HTTP and SQL over Websockets. 
+mod backend; +pub mod cancel_set; mod conn_pool; +mod http_util; +mod json; mod sql_over_http; mod websocket; +use atomic_take::AtomicTake; +use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; -use anyhow::bail; -use hyper::StatusCode; -use metrics::IntCounterPairGuard; +use anyhow::Context; +use futures::future::{select, Either}; +use futures::TryFutureExt; +use http::{Method, Response, StatusCode}; +use http_body_util::Full; +use hyper1::body::Incoming; +use hyper_util::rt::TokioExecutor; +use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; +use tokio::time::timeout; +use tokio_rustls::{server::TlsStream, TlsAcceptor}; use tokio_util::task::TaskTracker; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; use crate::context::RequestMonitoring; -use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; -use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; +use crate::metrics::Metrics; +use crate::protocol2::{read_proxy_protocol, ChainRW}; +use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; -use crate::{cancellation::CancelMap, config::ProxyConfig}; -use futures::StreamExt; -use hyper::{ - server::{ - accept, - conn::{AddrIncoming, AddrStream}, - }, - Body, Method, Request, Response, -}; +use crate::serverless::backend::PoolingBackend; +use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::IpAddr; -use std::task::Poll; -use std::{future::ready, sync::Arc}; -use tls_listener::TlsListener; -use tokio::net::TcpListener; +use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; +use std::sync::Arc; +use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument}; -use utils::http::{error::ApiError, 
json::json_response}; +use tracing::{error, info, warn, Instrument}; +use utils::http::error::ApiError; + +pub const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, ws_listener: TcpListener, cancellation_token: CancellationToken, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); } - let conn_pool = conn_pool::GlobalConnPool::new(config); - - let conn_pool2 = Arc::clone(&conn_pool); - tokio::spawn(async move { - conn_pool2.gc_worker(StdRng::from_entropy()).await; - }); + let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config); + { + let conn_pool = Arc::clone(&conn_pool); + tokio::spawn(async move { + conn_pool.gc_worker(StdRng::from_entropy()).await; + }); + } // shutdown the connection pool tokio::spawn({ @@ -69,140 +81,243 @@ pub async fn task_main( } }); - let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config()); - let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config { - Some(config) => config.into(), + let backend = Arc::new(PoolingBackend { + pool: Arc::clone(&conn_pool), + config, + endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), + }); + + let tls_config = match config.tls_config.as_ref() { + Some(config) => config, None => { warn!("TLS config is missing, WebSocket Secure server will not be started"); return Ok(()); } }; + let mut tls_server_config = rustls::ServerConfig::clone(&tls_config.to_server_config()); + // prefer http2, but support http/1.1 + tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + let tls_acceptor: tokio_rustls::TlsAcceptor = Arc::new(tls_server_config).into(); - let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?; - let _ = addr_incoming.set_nodelay(true); - let addr_incoming = ProxyProtocolAccept { - incoming: addr_incoming, - }; + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + 
connections.close(); // allows `connections.wait to complete` - let ws_connections = tokio_util::task::task_tracker::TaskTracker::new(); - ws_connections.close(); // allows `ws_connections.wait to complete` - - let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| { - if let Err(err) = conn { - error!("failed to accept TLS connection for websockets: {err:?}"); - ready(false) - } else { - ready(true) + while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { + let (conn, peer_addr) = res.context("could not accept TCP stream")?; + if let Err(e) = conn.set_nodelay(true) { + tracing::error!("could not set nodelay: {e}"); + continue; } - }); + let conn_id = uuid::Uuid::new_v4(); + let http_conn_span = tracing::info_span!("http_conn", ?conn_id); - let make_svc = hyper::service::make_service_fn( - |stream: &tokio_rustls::server::TlsStream>| { - let (io, tls) = stream.get_ref(); - let client_addr = io.client_addr(); - let remote_addr = io.inner.remote_addr(); - let sni_name = tls.server_name().map(|s| s.to_string()); - let conn_pool = conn_pool.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - - async move { - let peer_addr = match client_addr { - Some(addr) => addr, - None if config.require_client_ip => bail!("missing required client ip"), - None => remote_addr, - }; - Ok(MetricService::new(hyper::service::service_fn( - move |req: Request| { - let sni_name = sni_name.clone(); - let conn_pool = conn_pool.clone(); - let ws_connections = ws_connections.clone(); - let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - - async move { - let cancel_map = Arc::new(CancelMap::default()); - let session_id = uuid::Uuid::new_v4(); - - request_handler( - req, - config, - conn_pool, - ws_connections, - cancel_map, - session_id, - sni_name, - peer_addr.ip(), - endpoint_rate_limiter, - ) - .instrument(info_span!( - "serverless", - session = %session_id, 
- %peer_addr, - )) - .await - } - }, - ))) + let n_connections = Metrics::get() + .proxy + .client_connections + .sample(crate::metrics::Protocol::Http); + tracing::trace!(?n_connections, threshold = ?config.http_config.client_conn_threshold, "check"); + if n_connections > config.http_config.client_conn_threshold { + tracing::trace!("attempting to cancel a random connection"); + if let Some(token) = config.http_config.cancel_set.take() { + tracing::debug!("cancelling a random connection"); + token.cancel() } - }, - ); + } - hyper::Server::builder(accept::from_stream(tls_listener)) - .serve(make_svc) - .with_graceful_shutdown(cancellation_token.cancelled()) - .await?; + let conn_token = cancellation_token.child_token(); + let tls_acceptor = tls_acceptor.clone(); + let backend = backend.clone(); + let connections2 = connections.clone(); + let cancellation_handler = cancellation_handler.clone(); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + connections.spawn( + async move { + let conn_token2 = conn_token.clone(); + let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token2); - // await websocket connections - ws_connections.wait().await; + let session_id = uuid::Uuid::new_v4(); + + let _gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Http); + + let startup_result = Box::pin(connection_startup( + config, + tls_acceptor, + session_id, + conn, + peer_addr, + )) + .await; + let Some((conn, peer_addr)) = startup_result else { + return; + }; + + Box::pin(connection_handler( + config, + backend, + connections2, + cancellation_handler, + endpoint_rate_limiter, + conn_token, + conn, + peer_addr, + session_id, + )) + .await; + } + .instrument(http_conn_span), + ); + } + + connections.wait().await; Ok(()) } -struct MetricService { - inner: S, - _gauge: IntCounterPairGuard, -} - -impl MetricService { - fn new(inner: S) -> MetricService { - MetricService { - inner, - _gauge: NUM_CLIENT_CONNECTION_GAUGE - 
.with_label_values(&["http"]) - .guard(), +/// Handles the TCP startup lifecycle. +/// 1. Parses PROXY protocol V2 +/// 2. Handles TLS handshake +async fn connection_startup( + config: &ProxyConfig, + tls_acceptor: TlsAcceptor, + session_id: uuid::Uuid, + conn: TcpStream, + peer_addr: SocketAddr, +) -> Option<(TlsStream>, IpAddr)> { + // handle PROXY protocol + let (conn, peer) = match read_proxy_protocol(conn).await { + Ok(c) => c, + Err(e) => { + tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + return None; } - } + }; + + let peer_addr = peer.unwrap_or(peer_addr).ip(); + let has_private_peer_addr = match peer_addr { + IpAddr::V4(ip) => ip.is_private(), + _ => false, + }; + info!(?session_id, %peer_addr, "accepted new TCP connection"); + + // try upgrade to TLS, but with a timeout. + let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { + Ok(Ok(conn)) => { + info!(?session_id, %peer_addr, "accepted new TLS connection"); + conn + } + // The handshake failed + Ok(Err(e)) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return None; + } + // The handshake timed out + Err(e) => { + if !has_private_peer_addr { + Metrics::get().proxy.tls_handshake_failures.inc(); + } + warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + return None; + } + }; + + Some((conn, peer_addr)) } -impl hyper::service::Service> for MetricService -where - S: hyper::service::Service>, -{ - type Response = S::Response; - type Error = S::Error; - type Future = S::Future; +/// Handles HTTP connection +/// 1. With graceful shutdowns +/// 2. With graceful request cancellation with connection failure +/// 3. With websocket upgrade support. 
+#[allow(clippy::too_many_arguments)] +async fn connection_handler( + config: &'static ProxyConfig, + backend: Arc, + connections: TaskTracker, + cancellation_handler: Arc, + endpoint_rate_limiter: Arc, + cancellation_token: CancellationToken, + conn: TlsStream>, + peer_addr: IpAddr, + session_id: uuid::Uuid, +) { + let session_id = AtomicTake::new(session_id); - fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { - self.inner.poll_ready(cx) - } + // Cancel all current inflight HTTP requests if the HTTP connection is closed. + let http_cancellation_token = CancellationToken::new(); + let _cancel_connection = http_cancellation_token.clone().drop_guard(); - fn call(&mut self, req: Request) -> Self::Future { - self.inner.call(req) + let server = Builder::new(TokioExecutor::new()); + let conn = server.serve_connection_with_upgrades( + hyper_util::rt::TokioIo::new(conn), + hyper1::service::service_fn(move |req: hyper1::Request| { + // First HTTP request shares the same session ID + let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + // Cancel the current inflight HTTP request if the requets stream is closed. + // This is slightly different to `_cancel_connection` in that + // h2 can cancel individual requests with a `RST_STREAM`. + let http_request_token = http_cancellation_token.child_token(); + let cancel_request = http_request_token.clone().drop_guard(); + + // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. + // By spawning the future, we ensure it never gets cancelled until it decides to. 
+ let handler = connections.spawn( + request_handler( + req, + config, + backend.clone(), + connections.clone(), + cancellation_handler.clone(), + session_id, + peer_addr, + http_request_token, + endpoint_rate_limiter.clone(), + ) + .in_current_span() + .map_ok_or_else(api_error_into_response, |r| r), + ); + async move { + let res = handler.await; + cancel_request.disarm(); + res + } + }), + ); + + // On cancellation, trigger the HTTP connection handler to shut down. + let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { + Either::Left((_cancelled, mut conn)) => { + tracing::debug!(%peer_addr, "cancelling connection"); + conn.as_mut().graceful_shutdown(); + conn.await + } + Either::Right((res, _)) => res, + }; + + match res { + Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), + Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), } } #[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: Request, + mut request: hyper1::Request, config: &'static ProxyConfig, - conn_pool: Arc, + backend: Arc, ws_connections: TaskTracker, - cancel_map: Arc, + cancellation_handler: Arc, session_id: uuid::Uuid, - sni_hostname: Option, peer_addr: IpAddr, + // used to cancel in-flight HTTP requests. not used to cancel websockets + http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, -) -> Result, ApiError> { +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -211,46 +326,53 @@ async fn request_handler( .map(|s| s.to_string()); // Check if the request is a websocket upgrade request. 
- if hyper_tungstenite::is_upgrade_request(&request) { - info!(session_id = ?session_id, "performing websocket upgrade"); + if framed_websockets::upgrade::is_upgrade_request(&request) { + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Ws, + &config.region, + ); - let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) + let span = ctx.span.clone(); + info!(parent: &span, "performing websocket upgrade"); + + let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; ws_connections.spawn( async move { - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region); - if let Err(e) = websocket::serve_websocket( config, - &mut ctx, + ctx, websocket, - &cancel_map, - host, + cancellation_handler, endpoint_rate_limiter, + host, ) .await { - error!(session_id = ?session_id, "error in websocket connection: {e:#}"); + error!("error in websocket connection: {e:#}"); } } - .in_current_span(), + .instrument(span), ); // Return the response so the spawned future can continue. 
- Ok(response) - } else if request.uri().path() == "/sql" && request.method() == Method::POST { - let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region); + Ok(response.map(|_: http_body_util::Empty| Full::new(Bytes::new()))) + } else if request.uri().path() == "/sql" && *request.method() == Method::POST { + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Http, + &config.region, + ); + let span = ctx.span.clone(); - sql_over_http::handle( - &config.http_config, - &mut ctx, - request, - sni_hostname, - conn_pool, - ) - .await - } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { + sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) + .instrument(span) + .await + } else if request.uri().path() == "/sql" && *request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") .header("Access-Control-Allow-Origin", "*") @@ -260,7 +382,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Body::empty()) + .body(Full::new(Bytes::new())) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs new file mode 100644 index 0000000000..86e64c0a38 --- /dev/null +++ b/proxy/src/serverless/backend.rs @@ -0,0 +1,252 @@ +use std::{sync::Arc, time::Duration}; + +use async_trait::async_trait; +use tracing::{field::display, info}; + +use crate::{ + auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, + compute, + config::{AuthenticationConfig, ProxyConfig}, + console::{ + errors::{GetAuthInfoError, WakeComputeError}, + locks::ApiLocks, + provider::ApiLockError, + CachedNodeInfo, + }, + 
context::RequestMonitoring, + error::{ErrorKind, ReportableError, UserFacingError}, + intern::EndpointIdInt, + proxy::{connect_compute::ConnectMechanism, retry::ShouldRetry}, + rate_limiter::EndpointRateLimiter, + Host, +}; + +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; + +pub struct PoolingBackend { + pub pool: Arc>, + pub config: &'static ProxyConfig, + pub endpoint_rate_limiter: Arc, +} + +impl PoolingBackend { + pub async fn authenticate( + &self, + ctx: &mut RequestMonitoring, + config: &AuthenticationConfig, + conn_info: &ConnInfo, + ) -> Result { + let user_info = conn_info.user_info.clone(); + let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); + let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; + if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { + return Err(AuthError::ip_address_not_allowed(ctx.peer_addr)); + } + if !self + .endpoint_rate_limiter + .check(conn_info.user_info.endpoint.clone().into(), 1) + { + return Err(AuthError::too_many_connections()); + } + let cached_secret = match maybe_secret { + Some(secret) => secret, + None => backend.get_role_secret(ctx).await?, + }; + + let secret = match cached_secret.value.clone() { + Some(secret) => self.config.authentication_config.check_rate_limit( + ctx, + config, + secret, + &user_info.endpoint, + true, + )?, + None => { + // If we don't have an authentication secret, for the http flow we can just return an error. 
+ info!("authentication info not found"); + return Err(AuthError::auth_failed(&*user_info.user)); + } + }; + let ep = EndpointIdInt::from(&conn_info.user_info.endpoint); + let auth_outcome = crate::auth::validate_password_and_exchange( + &config.thread_pool, + ep, + &conn_info.password, + secret, + ) + .await?; + let res = match auth_outcome { + crate::sasl::Outcome::Success(key) => { + info!("user successfully authenticated"); + Ok(key) + } + crate::sasl::Outcome::Failure(reason) => { + info!("auth backend failed with an error: {reason}"); + Err(AuthError::auth_failed(&*conn_info.user_info.user)) + } + }; + res.map(|key| ComputeCredentials { + info: user_info, + keys: key, + }) + } + + // Wake up the destination if needed. Code here is a bit involved because + // we reuse the code from the usual proxy and we need to prepare few structures + // that this code expects. + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + pub async fn connect_to_compute( + &self, + ctx: &mut RequestMonitoring, + conn_info: ConnInfo, + keys: ComputeCredentials, + force_new: bool, + ) -> Result, HttpConnError> { + let maybe_client = if !force_new { + info!("pool: looking for an existing connection"); + self.pool.get(ctx, &conn_info)? 
+ } else { + info!("pool: pool is disabled"); + None + }; + + if let Some(client) = maybe_client { + return Ok(client); + } + let conn_id = uuid::Uuid::new_v4(); + tracing::Span::current().record("conn_id", display(conn_id)); + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + let backend = self.config.auth_backend.as_ref().map(|_| keys); + crate::proxy::connect_compute::connect_to_compute( + ctx, + &TokioMechanism { + conn_id, + conn_info, + pool: self.pool.clone(), + locks: &self.config.connect_compute_locks, + }, + &backend, + false, // do not allow self signed compute for http flow + self.config.wake_compute_retry_config, + self.config.connect_to_compute_retry_config, + ) + .await + } +} + +#[derive(Debug, thiserror::Error)] +pub enum HttpConnError { + #[error("pooled connection closed at inconsistent state")] + ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), + #[error("could not connection to compute")] + ConnectionError(#[from] tokio_postgres::Error), + + #[error("could not get auth info")] + GetAuthInfo(#[from] GetAuthInfoError), + #[error("user not authenticated")] + AuthError(#[from] AuthError), + #[error("wake_compute returned error")] + WakeCompute(#[from] WakeComputeError), + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), +} + +impl ReportableError for HttpConnError { + fn get_error_kind(&self) -> ErrorKind { + match self { + HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, + HttpConnError::ConnectionError(p) => p.get_error_kind(), + HttpConnError::GetAuthInfo(a) => a.get_error_kind(), + HttpConnError::AuthError(a) => a.get_error_kind(), + HttpConnError::WakeCompute(w) => w.get_error_kind(), + HttpConnError::TooManyConnectionAttempts(w) => w.get_error_kind(), + } + } +} + +impl UserFacingError for HttpConnError { + fn to_string_client(&self) -> String { + match self { + HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), + 
HttpConnError::ConnectionError(p) => p.to_string(), + HttpConnError::GetAuthInfo(c) => c.to_string_client(), + HttpConnError::AuthError(c) => c.to_string_client(), + HttpConnError::WakeCompute(c) => c.to_string_client(), + HttpConnError::TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } + } + } +} + +impl ShouldRetry for HttpConnError { + fn could_retry(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.could_retry(), + HttpConnError::ConnectionClosedAbruptly(_) => false, + HttpConnError::GetAuthInfo(_) => false, + HttpConnError::AuthError(_) => false, + HttpConnError::WakeCompute(_) => false, + HttpConnError::TooManyConnectionAttempts(_) => false, + } + } + fn should_retry_database_address(&self) -> bool { + match self { + HttpConnError::ConnectionError(e) => e.should_retry_database_address(), + // we never checked cache validity + HttpConnError::TooManyConnectionAttempts(_) => false, + _ => true, + } + } +} + +struct TokioMechanism { + pool: Arc>, + conn_info: ConnInfo, + conn_id: uuid::Uuid, + + /// connect_to_compute concurrency lock + locks: &'static ApiLocks, +} + +#[async_trait] +impl ConnectMechanism for TokioMechanism { + type Connection = Client; + type ConnectError = HttpConnError; + type Error = HttpConnError; + + async fn connect_once( + &self, + ctx: &mut RequestMonitoring, + node_info: &CachedNodeInfo, + timeout: Duration, + ) -> Result { + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + + let mut config = (*node_info.config).clone(); + let config = config + .user(&self.conn_info.user_info.user) + .password(&*self.conn_info.password) + .dbname(&self.conn_info.dbname) + .connect_timeout(timeout); + + let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); + let res = config.connect(tokio_postgres::NoTls).await; + drop(pause); + let (client, connection) = 
permit.release_result(res)?; + + tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); + Ok(poll_client( + self.pool.clone(), + ctx, + self.conn_info.clone(), + client, + connection, + self.conn_id, + node_info.aux.clone(), + )) + } + + fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} +} diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs new file mode 100644 index 0000000000..390df7f4f7 --- /dev/null +++ b/proxy/src/serverless/cancel_set.rs @@ -0,0 +1,102 @@ +//! A set for cancelling random http connections + +use std::{ + hash::{BuildHasher, BuildHasherDefault}, + num::NonZeroUsize, + time::Duration, +}; + +use indexmap::IndexMap; +use parking_lot::Mutex; +use rand::{thread_rng, Rng}; +use rustc_hash::FxHasher; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use uuid::Uuid; + +type Hasher = BuildHasherDefault; + +pub struct CancelSet { + shards: Box<[Mutex]>, + // keyed by random uuid, fxhasher is fine + hasher: Hasher, +} + +pub struct CancelShard { + tokens: IndexMap, +} + +impl CancelSet { + pub fn new(shards: usize) -> Self { + CancelSet { + shards: (0..shards) + .map(|_| { + Mutex::new(CancelShard { + tokens: IndexMap::with_hasher(Hasher::default()), + }) + }) + .collect(), + hasher: Hasher::default(), + } + } + + pub fn take(&self) -> Option { + for _ in 0..4 { + if let Some(token) = self.take_raw(thread_rng().gen()) { + return Some(token); + } + tracing::trace!("failed to get cancel token"); + } + None + } + + pub fn take_raw(&self, rng: usize) -> Option { + NonZeroUsize::new(self.shards.len()) + .and_then(|len| self.shards[rng % len].lock().take(rng / len)) + } + + pub fn insert(&self, id: uuid::Uuid, token: CancellationToken) -> CancelGuard<'_> { + let shard = NonZeroUsize::new(self.shards.len()).map(|len| { + let hash = self.hasher.hash_one(id) as usize; + let shard = &self.shards[hash % len]; + shard.lock().insert(id, token); + shard + }); + 
CancelGuard { shard, id } + } +} + +impl CancelShard { + fn take(&mut self, rng: usize) -> Option { + NonZeroUsize::new(self.tokens.len()).and_then(|len| { + // 10 second grace period so we don't cancel new connections + if self.tokens.get_index(rng % len)?.1 .0.elapsed() < Duration::from_secs(10) { + return None; + } + + let (_key, (_insert, token)) = self.tokens.swap_remove_index(rng % len)?; + Some(token) + }) + } + + fn remove(&mut self, id: uuid::Uuid) { + self.tokens.swap_remove(&id); + } + + fn insert(&mut self, id: uuid::Uuid, token: CancellationToken) { + self.tokens.insert(id, (Instant::now(), token)); + } +} + +pub struct CancelGuard<'a> { + shard: Option<&'a Mutex>, + id: Uuid, +} + +impl Drop for CancelGuard<'_> { + fn drop(&mut self) { + if let Some(shard) = self.shard { + shard.lock().remove(self.id); + } + } +} diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c9f3fd6a38..170bda062e 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,18 +1,8 @@ -use anyhow::{anyhow, Context}; -use async_trait::async_trait; use dashmap::DashMap; use futures::{future::poll_fn, Future}; -use metrics::{register_int_counter_pair, IntCounterPair, IntCounterPairGuard}; -use once_cell::sync::Lazy; use parking_lot::RwLock; -use pbkdf2::{ - password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString}, - Params, Pbkdf2, -}; -use pq_proto::StartupMessageParams; -use prometheus::{exponential_buckets, register_histogram, Histogram}; use rand::Rng; -use smol_str::SmolStr; +use smallvec::SmallVec; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, @@ -22,80 +12,109 @@ use std::{ ops::Deref, sync::atomic::{self, AtomicUsize}, }; -use tokio::time::{self, Instant}; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus}; +use tokio::time::Instant; +use tokio_postgres::tls::NoTlsStream; +use tokio_postgres::{AsyncMessage, 
ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; +use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ - auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list}, - console, - context::RequestMonitoring, - metrics::NUM_DB_CONNECTIONS_GAUGE, - proxy::{connect_compute::ConnectMechanism, neon_options}, - usage_metrics::{Ids, MetricCounter, USAGE_METRICS}, + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, }; -use crate::{compute, config}; use tracing::{debug, error, warn, Span}; use tracing::{info, info_span, Instrument}; -pub const APP_NAME: &str = "/sql_over_http"; +use super::backend::HttpConnError; #[derive(Debug, Clone)] pub struct ConnInfo { - pub username: SmolStr, - pub dbname: SmolStr, - pub hostname: SmolStr, - pub password: SmolStr, - pub options: Option, + pub user_info: ComputeUserInfo, + pub dbname: DbName, + pub password: SmallVec<[u8; 16]>, } impl ConnInfo { // hm, change to hasher to avoid cloning? - pub fn db_and_user(&self) -> (SmolStr, SmolStr) { - (self.dbname.clone(), self.username.clone()) + pub fn db_and_user(&self) -> (DbName, RoleName) { + (self.dbname.clone(), self.user_info.user.clone()) + } + + pub fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. 
+ if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } } } impl fmt::Display for ConnInfo { // use custom display to avoid logging password fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}@{}/{}", self.username, self.hostname, self.dbname) + write!( + f, + "{}@{}/{}?{}", + self.user_info.user, + self.user_info.endpoint, + self.dbname, + self.user_info.options.get_cache_key("") + ) } } -struct ConnPoolEntry { - conn: ClientInner, +struct ConnPoolEntry { + conn: ClientInner, _last_access: std::time::Instant, } // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub struct EndpointConnPool { - pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>, +pub struct EndpointConnPool { + pools: HashMap<(DbName, RoleName), DbUserConnPool>, total_conns: usize, max_conns: usize, - _guard: IntCounterPairGuard, + _guard: HttpEndpointPoolsGuard<'static>, + global_connections_count: Arc, + global_pool_size_max_conns: usize, } -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option { +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { let Self { - pools, total_conns, .. + pools, + total_conns, + global_connections_count, + .. } = self; - pools - .get_mut(&db_user) - .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) + pools.get_mut(&db_user).and_then(|pool_entries| { + pool_entries.get_conn_entry(total_conns, global_connections_count.clone()) + }) } - fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool { + fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { let Self { - pools, total_conns, .. + pools, + total_conns, + global_connections_count, + .. 
} = self; if let Some(pool) = pools.get_mut(&db_user) { let old_len = pool.conns.len(); pool.conns.retain(|conn| conn.conn.conn_id != conn_id); let new_len = pool.conns.len(); let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + } *total_conns -= removed; removed > 0 } else { @@ -103,12 +122,22 @@ impl EndpointConnPool { } } - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> { + fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { let conn_id = client.conn_id; - if client.inner.is_closed() { + if client.is_closed() { info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return Ok(()); + return; + } + let global_max_conn = pool.read().global_pool_size_max_conns; + if pool + .read() + .global_connections_count + .load(atomic::Ordering::Relaxed) + >= global_max_conn + { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + return; } // return connection to the pool @@ -118,18 +147,23 @@ impl EndpointConnPool { let mut pool = pool.write(); if pool.total_conns < pool.max_conns { - // we create this db-user entry in get, so it should not be None - if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) { - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); + let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); - returned = true; - per_db_size = pool_entries.conns.len(); + returned = true; + per_db_size = pool_entries.conns.len(); - pool.total_conns += 1; - } + pool.total_conns += 1; + pool.global_connections_count + .fetch_add(1, atomic::Ordering::Relaxed); + Metrics::get() 
+ .proxy + .http_pool_opened_connections + .get_metric() + .inc(); } pool.total_conns @@ -141,54 +175,72 @@ impl EndpointConnPool { } else { info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); } - - Ok(()) } } -/// 4096 is the number of rounds that SCRAM-SHA-256 recommends. -/// It's not the 600,000 that OWASP recommends... but our passwords are high entropy anyway. -/// -/// Still takes 1.4ms to hash on my hardware. -/// We don't want to ruin the latency improvements of using the pool by making password verification take too long -const PARAMS: Params = Params { - rounds: 4096, - output_length: 32, -}; - -#[derive(Default)] -pub struct DbUserConnPool { - conns: Vec, - password_hash: Option, +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if self.total_conns > 0 { + self.global_connections_count + .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); + } + } } -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) { +pub struct DbUserConnPool { + conns: Vec>, +} + +impl Default for DbUserConnPool { + fn default() -> Self { + Self { conns: Vec::new() } + } +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { let old_len = self.conns.len(); - self.conns.retain(|conn| !conn.conn.inner.is_closed()); + self.conns.retain(|conn| !conn.conn.is_closed()); let new_len = self.conns.len(); let removed = old_len - new_len; *conns -= removed; + removed } - fn get_conn_entry(&mut self, conns: &mut usize) -> Option { - self.clear_closed_clients(conns); + fn get_conn_entry( + &mut self, + conns: &mut usize, + global_connections_count: Arc, + ) -> Option> { + let mut removed = self.clear_closed_clients(conns); let conn = self.conns.pop(); if conn.is_some() { *conns -= 1; + removed += 1; } + global_connections_count.fetch_sub(removed, 
atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); conn } } -pub struct GlobalConnPool { +pub struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>, + global_pool: DashMap>>>, /// Number of endpoint-connection pools /// @@ -197,7 +249,10 @@ pub struct GlobalConnPool { /// It's only used for diagnostics. global_pool_size: AtomicUsize, - proxy_config: &'static crate::config::ProxyConfig, + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, } #[derive(Debug, Clone, Copy)] @@ -215,45 +270,39 @@ pub struct GlobalConnPoolOptions { pub idle_timeout: Duration, pub opt_in: bool, + + // Total number of connections in the pool. + pub max_total_conns: usize, } -pub static GC_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( - "proxy_http_pool_reclaimation_lag_seconds", - "Time it takes to reclaim unused connection pools", - // 1us -> 65ms - exponential_buckets(1e-6, 2.0, 16).unwrap(), - ) - .unwrap() -}); - -pub static ENDPOINT_POOLS: Lazy = Lazy::new(|| { - register_int_counter_pair!( - "proxy_http_pool_endpoints_registered_total", - "Number of endpoints we have registered pools for", - "proxy_http_pool_endpoints_unregistered_total", - "Number of endpoints we have unregistered pools for", - ) - .unwrap() -}); - -impl GlobalConnPool { - pub fn new(config: &'static crate::config::ProxyConfig) -> Arc { - let shards = config.http_config.pool_options.pool_shards; +impl GlobalConnPool { + pub fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; Arc::new(Self { global_pool: DashMap::with_shard_amount(shards), global_pool_size: AtomicUsize::new(0), - proxy_config: config, + config, + 
global_connections_count: Arc::new(AtomicUsize::new(0)), }) } + #[cfg(test)] + pub fn get_global_connections_count(&self) -> usize { + self.global_connections_count + .load(atomic::Ordering::Relaxed) + } + + pub fn get_idle_timeout(&self) -> Duration { + self.config.pool_options.idle_timeout + } + pub fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); } pub async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.proxy_config.http_config.pool_options.gc_epoch; + let epoch = self.config.pool_options.gc_epoch; let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); loop { interval.tick().await; @@ -269,8 +318,12 @@ impl GlobalConnPool { // acquire a random shard lock let mut shard = self.global_pool.shards()[shard].write(); - let timer = GC_LATENCY.start_timer(); + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); let current_len = shard.len(); + let mut clients_removed = 0; shard.retain(|endpoint, x| { // if the current endpoint pool is unique (no other strong or weak references) // then it is currently not in use by any connections. @@ -280,9 +333,9 @@ impl GlobalConnPool { } = pool.get_mut(); // ensure that closed clients are removed - pools - .iter_mut() - .for_each(|(_, db_pool)| db_pool.clear_closed_clients(total_conns)); + pools.iter_mut().for_each(|(_, db_pool)| { + clients_removed += db_pool.clear_closed_clients(total_conns); + }); // we only remove this pool if it has no active connections if *total_conns == 0 { @@ -293,10 +346,24 @@ impl GlobalConnPool { true }); + let new_len = shard.len(); drop(shard); - timer.observe_duration(); + timer.observe(); + // Do logging outside of the lock. 
+ if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); + info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + } let removed = current_len - new_len; if removed > 0 { @@ -308,130 +375,53 @@ impl GlobalConnPool { } } - pub async fn get( + pub fn get( self: &Arc, ctx: &mut RequestMonitoring, - conn_info: ConnInfo, - force_new: bool, - ) -> anyhow::Result { - let mut client: Option = None; + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; - let mut hash_valid = false; - let mut endpoint_pool = Weak::new(); - if !force_new { - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - endpoint_pool = Arc::downgrade(&pool); - let mut hash = None; - - // find a pool entry by (dbname, username) if exists - { - let pool = pool.read(); - if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) { - if !pool_entries.conns.is_empty() { - hash = pool_entries.password_hash.clone(); - } - } - } - - // a connection exists in the pool, verify the password hash - if let Some(hash) = hash { - let pw = conn_info.password.clone(); - let validate = tokio::task::spawn_blocking(move || { - Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash()) - }) - .await?; - - // if the hash is invalid, don't error - // we will continue with the regular connection flow - if validate.is_ok() { - hash_valid = true; - if let Some(entry) = pool.write().get_conn_entry(conn_info.db_and_user()) { - client = Some(entry.conn) - } - } - } + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + 
{ + client = Some(entry.conn) } + let endpoint_pool = Arc::downgrade(&endpoint_pool); // ok return cached connection if found and establish a new one otherwise - let new_client = if let Some(client) = client { - if client.inner.is_closed() { - let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one"); - connect_to_compute( - self.proxy_config, - ctx, - &conn_info, - conn_id, - endpoint_pool.clone(), - ) - .await + if let Some(client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); } else { - info!("pool: reusing connection '{conn_info}'"); - client.session.send(ctx.session_id)?; + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", &tracing::field::display(client.inner.get_process_id()), ); - ctx.latency_timer.pool_hit(); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + client.session.send(ctx.session_id)?; + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.latency_timer.success(); - return Ok(Client::new(client, conn_info, endpoint_pool).await); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } - } else { - let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - connect_to_compute( - self.proxy_config, - ctx, - &conn_info, - conn_id, - endpoint_pool.clone(), - ) - .await - }; - if let Ok(client) = &new_client { - tracing::Span::current().record( - "pid", - &tracing::field::display(client.inner.get_process_id()), - ); } - - match &new_client { - // clear the hash. 
it's no longer valid - // TODO: update tokio-postgres fork to allow access to this error kind directly - Err(err) - if hash_valid && err.to_string().contains("password authentication failed") => - { - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - let mut pool = pool.write(); - if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) { - entry.password_hash = None; - } - } - // new password is valid and we should insert/update it - Ok(_) if !force_new && !hash_valid => { - let pw = conn_info.password.clone(); - let new_hash = tokio::task::spawn_blocking(move || { - let salt = SaltString::generate(rand::rngs::OsRng); - Pbkdf2 - .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt) - .map(|s| s.serialize()) - }) - .await??; - - let pool = self.get_or_create_endpoint_pool(&conn_info.hostname); - let mut pool = pool.write(); - pool.pools - .entry(conn_info.db_and_user()) - .or_default() - .password_hash = Some(new_hash); - } - _ => {} - } - let new_client = new_client?; - Ok(Client::new(new_client, conn_info, endpoint_pool).await) + Ok(None) } - fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc> { + fn get_or_create_endpoint_pool( + self: &Arc, + endpoint: &EndpointCacheKey, + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -441,12 +431,10 @@ impl GlobalConnPool { let new_pool = Arc::new(RwLock::new(EndpointConnPool { pools: HashMap::new(), total_conns: 0, - max_conns: self - .proxy_config - .http_config - .pool_options - .max_conns_per_endpoint, - _guard: ENDPOINT_POOLS.guard(), + max_conns: self.config.pool_options.max_conns_per_endpoint, + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count: self.global_connections_count.clone(), + global_pool_size_max_conns: self.config.pool_options.max_total_conns, })); // find or create a pool for this endpoint @@ -475,295 +463,218 @@ impl GlobalConnPool { } } -struct 
TokioMechanism<'a> { - pool: Weak>, - conn_info: &'a ConnInfo, - conn_id: uuid::Uuid, - idle: Duration, -} - -#[async_trait] -impl ConnectMechanism for TokioMechanism<'_> { - type Connection = ClientInner; - type ConnectError = tokio_postgres::Error; - type Error = anyhow::Error; - - async fn connect_once( - &self, - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - timeout: time::Duration, - ) -> Result { - connect_to_compute_once( - ctx, - node_info, - self.conn_info, - timeout, - self.conn_id, - self.pool.clone(), - self.idle, - ) - .await - } - - fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} -} - -// Wake up the destination if needed. Code here is a bit involved because -// we reuse the code from the usual proxy and we need to prepare few structures -// that this code expects. -#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] -async fn connect_to_compute( - config: &config::ProxyConfig, +pub fn poll_client( + global_pool: Arc>, ctx: &mut RequestMonitoring, - conn_info: &ConnInfo, + conn_info: ConnInfo, + client: C, + mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, - pool: Weak>, -) -> anyhow::Result { - let tls = config.tls_config.as_ref(); - let common_names = tls.and_then(|tls| tls.common_names.clone()); - - let params = StartupMessageParams::new([ - ("user", &conn_info.username), - ("database", &conn_info.dbname), - ("application_name", APP_NAME), - ("options", conn_info.options.as_deref().unwrap_or("")), - ]); - let creds = - auth::ClientCredentials::parse(ctx, ¶ms, Some(&conn_info.hostname), common_names)?; - - let creds = - ComputeUserInfo::try_from(creds).map_err(|_| anyhow!("missing endpoint identifier"))?; - let backend = config.auth_backend.as_ref().map(|_| creds); - - let console_options = neon_options(¶ms); - - if !config.disable_ip_check_for_http { - let allowed_ips = backend.get_allowed_ips(ctx).await?; - if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) { - 
return Err(auth::AuthError::ip_address_not_allowed().into()); - } - } - let extra = console::ConsoleReqExtra { - options: console_options, - }; - let node_info = backend - .wake_compute(ctx, &extra) - .await? - .context("missing cache entry from wake_compute")?; - - ctx.set_project(node_info.aux.clone()); - - crate::proxy::connect_compute::connect_to_compute( - ctx, - &TokioMechanism { - conn_id, - conn_info, - pool, - idle: config.http_config.pool_options.idle_timeout, - }, - node_info, - &extra, - &backend, - ) - .await -} - -async fn connect_to_compute_once( - ctx: &mut RequestMonitoring, - node_info: &console::CachedNodeInfo, - conn_info: &ConnInfo, - timeout: time::Duration, - conn_id: uuid::Uuid, - pool: Weak>, - idle: Duration, -) -> Result { - let mut config = (*node_info.config).clone(); - let mut session = ctx.session_id; - - let (client, mut connection) = config - .user(&conn_info.username) - .password(&*conn_info.password) - .dbname(&conn_info.dbname) - .connect_timeout(timeout) - .connect(tokio_postgres::NoTls) - .await?; - - let conn_gauge = NUM_DB_CONNECTIONS_GAUGE - .with_label_values(&[ctx.protocol]) - .guard(); - - tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id())); - - let (tx, mut rx) = tokio::sync::watch::channel(session); + aux: MetricsAuxInfo, +) -> Client { + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol); + let mut session_id = ctx.session_id; + let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); + let cold_start_info = ctx.cold_start_info; span.in_scope(|| { - info!(%conn_info, %session, "new connection"); + info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); - let ids = Ids { - endpoint_id: node_info.aux.endpoint_id.clone(), - branch_id: node_info.aux.branch_id.clone(), + let pool = match conn_info.endpoint_cache_key() { + Some(endpoint) => 
Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), + None => Weak::new(), }; + let pool_clone = pool.clone(); let db_user = conn_info.db_and_user(); + let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( - async move { - let _conn_gauge = conn_gauge; - let mut idle_timeout = pin!(tokio::time::sleep(idle)); - poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session = *rx.borrow_and_update(); - info!(%session, "changed session"); + async move { + let _conn_gauge = conn_gauge; + let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + + poll_fn(move |cx| { + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); idle_timeout.as_mut().reset(Instant::now() + idle); } - - // 5 minute idle connection timeout - if idle_timeout.as_mut().poll(cx).is_ready() { - idle_timeout.as_mut().reset(Instant::now() + idle); - info!("connection idle"); - if let Some(pool) = pool.clone().upgrade() { - // remove client from pool - should close the connection if it's idle. 
- // does nothing if the client is currently checked-out and in-use - if pool.write().remove_client(db_user.clone(), conn_id) { - info!("idle connection removed"); - } - } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) } + _ => {} + } - loop { - let message = ready!(connection.poll_message(cx)); - - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session, "notice: {}", notice); - } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received"); - } - Some(Ok(_)) => { - warn!(%session, "unknown message"); - } - Some(Err(e)) => { - error!(%session, "connection error: {}", e); - break - } - None => { - info!("connection closed"); - break - } - } - } - - // remove from connection pool + // 5 minute idle connection timeout + if idle_timeout.as_mut().poll(cx).is_ready() { + idle_timeout.as_mut().reset(Instant::now() + idle); + info!("connection idle"); if let Some(pool) = pool.clone().upgrade() { + // remove client from pool - should close the connection if it's idle. 
+ // does nothing if the client is currently checked-out and in-use if pool.write().remove_client(db_user.clone(), conn_id) { - info!("closed connection removed"); + info!("idle connection removed"); } } + } - Poll::Ready(()) - }).await; + loop { + let message = ready!(connection.poll_message(cx)); - } - .instrument(span) - ); + match message { + Some(Ok(AsyncMessage::Notice(notice))) => { + info!(%session_id, "notice: {}", notice); + } + Some(Ok(AsyncMessage::Notification(notif))) => { + warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received"); + } + Some(Ok(_)) => { + warn!(%session_id, "unknown message"); + } + Some(Err(e)) => { + error!(%session_id, "connection error: {}", e); + break + } + None => { + info!("connection closed"); + break + } + } + } - Ok(ClientInner { + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_client(db_user.clone(), conn_id) { + info!("closed connection removed"); + } + } + + Poll::Ready(()) + }).await; + + } + .instrument(span)); + let inner = ClientInner { inner: client, session: tx, - ids, + cancel, + aux, conn_id, - }) + }; + Client::new(inner, conn_info, pool_clone) } -struct ClientInner { - inner: tokio_postgres::Client, +struct ClientInner { + inner: C, session: tokio::sync::watch::Sender, - ids: Ids, + cancel: CancellationToken, + aux: MetricsAuxInfo, conn_id: uuid::Uuid, } -impl Client { - pub fn metrics(&self) -> Arc { - USAGE_METRICS.register(self.inner.as_ref().unwrap().ids.clone()) +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); } } -pub struct Client { - conn_id: uuid::Uuid, +pub trait ClientInnerExt: Sync + Send + 'static { + fn is_closed(&self) -> bool; + fn get_process_id(&self) -> i32; +} + +impl ClientInnerExt for tokio_postgres::Client { + fn is_closed(&self) -> bool { + self.is_closed() + } + fn get_process_id(&self) -> i32 { + 
self.get_process_id() + } +} + +impl ClientInner { + pub fn is_closed(&self) -> bool { + self.inner.is_closed() + } +} + +impl Client { + pub fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux; + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) + } +} + +pub struct Client { span: Span, - inner: Option, + inner: Option>, conn_info: ConnInfo, - pool: Weak>, + pool: Weak>>, } -pub struct Discard<'a> { - conn_id: uuid::Uuid, +pub struct Discard<'a, C: ClientInnerExt> { conn_info: &'a ConnInfo, - pool: &'a mut Weak>, + pool: &'a mut Weak>>, } -impl Client { - pub(self) async fn new( - inner: ClientInner, +impl Client { + pub(self) fn new( + inner: ClientInner, conn_info: ConnInfo, - pool: Weak>, + pool: Weak>>, ) -> Self { Self { - conn_id: inner.conn_id, inner: Some(inner), span: Span::current(), conn_info, pool, } } - pub fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { + pub fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, pool, - conn_id, conn_info, span: _, } = self; - ( - &mut inner - .as_mut() - .expect("client inner should not be removed") - .inner, - Discard { - pool, - conn_info, - conn_id: *conn_id, - }, - ) - } - - pub fn check_idle(&mut self, status: ReadyForQueryStatus) { - self.inner().1.check_idle(status) - } - pub fn discard(&mut self) { - self.inner().1.discard() + let inner = inner.as_mut().expect("client inner should not be removed"); + (&mut inner.inner, Discard { pool, conn_info }) } } -impl Discard<'_> { +impl Discard<'_, C> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") + info!("pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { let 
conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") } } } -impl Deref for Client { - type Target = tokio_postgres::Client; +impl Deref for Client { + type Target = C; fn deref(&self) -> &Self::Target { &self @@ -774,8 +685,8 @@ impl Deref for Client { } } -impl Drop for Client { - fn drop(&mut self) { +impl Client { + fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self .inner @@ -784,10 +695,172 @@ impl Drop for Client { if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { let current_span = self.span.clone(); // return connection to the pool - tokio::task::spawn_blocking(move || { + return Some(move || { let _span = current_span.enter(); - let _ = EndpointConnPool::put(&conn_pool, &conn_info, client); + EndpointConnPool::put(&conn_pool, &conn_info, client); }); } + None + } +} + +impl Drop for Client { + fn drop(&mut self) { + if let Some(drop) = self.do_drop() { + tokio::task::spawn_blocking(drop); + } + } +} + +#[cfg(test)] +mod tests { + use std::{mem, sync::atomic::AtomicBool}; + + use crate::{serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId}; + + use super::*; + + struct MockClient(Arc); + impl MockClient { + fn new(is_closed: bool) -> Self { + MockClient(Arc::new(is_closed.into())) + } + } + impl ClientInnerExt for MockClient { + fn is_closed(&self) -> bool { + self.0.load(atomic::Ordering::Relaxed) + } + fn get_process_id(&self) -> i32 { + 0 + } + } + + fn create_inner() -> ClientInner { + create_inner_with(MockClient::new(false)) + } + + fn create_inner_with(client: MockClient) -> ClientInner { + ClientInner { + inner: client, + session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: 
CancellationToken::new(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, + conn_id: uuid::Uuid::new_v4(), + } + } + + #[tokio::test] + async fn test_pool() { + let _ = env_logger::try_init(); + let config = Box::leak(Box::new(crate::config::HttpConfig { + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: 2, + gc_epoch: Duration::from_secs(1), + pool_shards: 2, + idle_timeout: Duration::from_secs(1), + opt_in: false, + max_total_conns: 3, + }, + request_timeout: Duration::from_secs(1), + cancel_set: CancelSet::new(0), + client_conn_threshold: u64::MAX, + })); + let pool = GlobalConnPool::new(config); + let conn_info = ConnInfo { + user_info: ComputeUserInfo { + user: "user".into(), + endpoint: "endpoint".into(), + options: Default::default(), + }, + dbname: "dbname".into(), + password: "password".as_bytes().into(), + }; + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + assert_eq!(0, pool.get_global_connections_count()); + client.inner().1.discard(); + // Discard should not add the connection from the pool. + assert_eq!(0, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + assert_eq!(1, pool.get_global_connections_count()); + } + { + let mut closed_client = Client::new( + create_inner_with(MockClient::new(true)), + conn_info.clone(), + ep_pool.clone(), + ); + closed_client.do_drop().unwrap()(); + mem::forget(closed_client); // drop the client + // The closed client shouldn't be added to the pool. 
+ assert_eq!(1, pool.get_global_connections_count()); + } + let is_closed: Arc = Arc::new(false.into()); + { + let mut client = Client::new( + create_inner_with(MockClient(is_closed.clone())), + conn_info.clone(), + ep_pool.clone(), + ); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client should be added to the pool. + assert_eq!(2, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info, ep_pool); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client shouldn't be added to the pool. Because the ep-pool is full. + assert_eq!(2, pool.get_global_connections_count()); + } + + let conn_info = ConnInfo { + user_info: ComputeUserInfo { + user: "user".into(), + endpoint: "endpoint-2".into(), + options: Default::default(), + }, + dbname: "dbname".into(), + password: "password".as_bytes().into(), + }; + let ep_pool = Arc::downgrade( + &pool.get_or_create_endpoint_pool(&conn_info.endpoint_cache_key().unwrap()), + ); + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + assert_eq!(3, pool.get_global_connections_count()); + } + { + let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); + client.do_drop().unwrap()(); + mem::forget(client); // drop the client + + // The client shouldn't be added to the pool. Because the global pool is full. + assert_eq!(3, pool.get_global_connections_count()); + } + + is_closed.store(true, atomic::Ordering::Relaxed); + // Do gc for all shards. + pool.gc(0); + pool.gc(1); + // Closed client should be removed from the pool. + assert_eq!(2, pool.get_global_connections_count()); } } diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs new file mode 100644 index 0000000000..701ab58f63 --- /dev/null +++ b/proxy/src/serverless/http_util.rs @@ -0,0 +1,96 @@ +//! 
Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility +//! Will merge back in at some point in the future. + +use bytes::Bytes; + +use anyhow::Context; +use http::{Response, StatusCode}; +use http_body_util::Full; + +use serde::Serialize; +use utils::http::error::ApiError; + +/// Like [`ApiError::into_response`] +pub fn api_error_into_response(this: ApiError) -> Response> { + match this { + ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( + format!("{err:#?}"), // use debug printing so that we give the cause + StatusCode::BAD_REQUEST, + ), + ApiError::Forbidden(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::FORBIDDEN) + } + ApiError::Unauthorized(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::UNAUTHORIZED) + } + ApiError::NotFound(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::NOT_FOUND) + } + ApiError::Conflict(_) => { + HttpErrorBody::response_from_msg_and_status(this.to_string(), StatusCode::CONFLICT) + } + ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::PRECONDITION_FAILED, + ), + ApiError::ShuttingDown => HttpErrorBody::response_from_msg_and_status( + "Shutting down".to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::SERVICE_UNAVAILABLE, + ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), + ApiError::Cancelled => HttpErrorBody::response_from_msg_and_status( + this.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } +} + +/// Same as [`utils::http::error::HttpErrorBody`] +#[derive(Serialize)] 
+struct HttpErrorBody { + pub msg: String, +} + +impl HttpErrorBody { + /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] + fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + HttpErrorBody { msg }.to_response(status) + } + + /// Same as [`utils::http::error::HttpErrorBody::to_response`] + fn to_response(&self, status: StatusCode) -> Response> { + Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + // we do not have nested maps with non string keys so serialization shouldn't fail + .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .unwrap() + } +} + +/// Same as [`utils::http::json::json_response`] +pub fn json_response( + status: StatusCode, + data: T, +) -> Result>, ApiError> { + let json = serde_json::to_string(&data) + .context("Failed to serialize JSON response") + .map_err(ApiError::InternalServerError)?; + let response = Response::builder() + .status(status) + .header(http::header::CONTENT_TYPE, "application/json") + .body(Full::new(Bytes::from(json))) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs new file mode 100644 index 0000000000..c22c63e85b --- /dev/null +++ b/proxy/src/serverless/json.rs @@ -0,0 +1,462 @@ +use serde_json::Map; +use serde_json::Value; +use tokio_postgres::types::Kind; +use tokio_postgres::types::Type; +use tokio_postgres::Row; + +// +// Convert json non-string types to strings, so that they can be passed to Postgres +// as parameters. 
+// +pub fn json_to_pg_text(json: Vec) -> Vec> { + json.iter().map(json_value_to_pg_text).collect() +} + +fn json_value_to_pg_text(value: &Value) -> Option { + match value { + // special care for nulls + Value::Null => None, + + // convert to text with escaping + v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), + + // avoid escaping here, as we pass this as a parameter + Value::String(s) => Some(s.to_string()), + + // special care for arrays + Value::Array(_) => json_array_to_pg_array(value), + } +} + +// +// Serialize a JSON array to a Postgres array. Contrary to the strings in the params +// in the array we need to escape the strings. Postgres is okay with arrays of form +// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving +// it for Postgres to check. +// +// Example of the same escaping in node-postgres: packages/pg/lib/utils.js +// +fn json_array_to_pg_array(value: &Value) -> Option { + match value { + // special care for nulls + Value::Null => None, + + // convert to text with escaping + // here string needs to be escaped, as it is part of the array + v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()), + v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())), + + // recurse into array + Value::Array(arr) => { + let vals = arr + .iter() + .map(json_array_to_pg_array) + .map(|v| v.unwrap_or_else(|| "NULL".to_string())) + .collect::>() + .join(","); + + Some(format!("{{{}}}", vals)) + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum JsonConversionError { + #[error("internal error compute returned invalid data: {0}")] + AsTextError(tokio_postgres::Error), + #[error("parse int error: {0}")] + ParseIntError(#[from] std::num::ParseIntError), + #[error("parse float error: {0}")] + ParseFloatError(#[from] std::num::ParseFloatError), + #[error("parse json error: {0}")] + ParseJsonError(#[from] serde_json::Error), + #[error("unbalanced 
array")] + UnbalancedArray, +} + +// +// Convert postgres row with text-encoded values to JSON object +// +pub fn pg_text_row_to_json( + row: &Row, + columns: &[Type], + raw_output: bool, + array_mode: bool, +) -> Result { + let iter = row + .columns() + .iter() + .zip(columns) + .enumerate() + .map(|(i, (column, typ))| { + let name = column.name(); + let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?; + let json_value = if raw_output { + match pg_value { + Some(v) => Value::String(v.to_string()), + None => Value::Null, + } + } else { + pg_text_to_json(pg_value, typ)? + }; + Ok((name.to_string(), json_value)) + }); + + if array_mode { + // drop keys and aggregate into array + let arr = iter + .map(|r| r.map(|(_key, val)| val)) + .collect::, JsonConversionError>>()?; + Ok(Value::Array(arr)) + } else { + let obj = iter.collect::, JsonConversionError>>()?; + Ok(Value::Object(obj)) + } +} + +// +// Convert postgres text-encoded value to JSON value +// +fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { + if let Some(val) = pg_value { + if let Kind::Array(elem_type) = pg_type.kind() { + return pg_array_parse(val, elem_type); + } + + match *pg_type { + Type::BOOL => Ok(Value::Bool(val == "t")), + Type::INT2 | Type::INT4 => { + let val = val.parse::()?; + Ok(Value::Number(serde_json::Number::from(val))) + } + Type::FLOAT4 | Type::FLOAT8 => { + let fval = val.parse::()?; + let num = serde_json::Number::from_f64(fval); + if let Some(num) = num { + Ok(Value::Number(num)) + } else { + // Pass Nan, Inf, -Inf as strings + // JS JSON.stringify() does converts them to null, but we + // want to preserve them, so we pass them as strings + Ok(Value::String(val.to_string())) + } + } + Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), + _ => Ok(Value::String(val.to_string())), + } + } else { + Ok(Value::Null) + } +} + +// +// Parse postgres array into JSON array. 
+// +// This is a bit involved because we need to handle nested arrays and quoted +// values. Unlike postgres we don't check that all nested arrays have the same +// dimensions, we just return them as is. +// +fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { + _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) +} + +fn _pg_array_parse( + pg_array: &str, + elem_type: &Type, + nested: bool, +) -> Result<(Value, usize), JsonConversionError> { + let mut pg_array_chr = pg_array.char_indices(); + let mut level = 0; + let mut quote = false; + let mut entries: Vec = Vec::new(); + let mut entry = String::new(); + + // skip bounds decoration + if let Some('[') = pg_array.chars().next() { + for (_, c) in pg_array_chr.by_ref() { + if c == '=' { + break; + } + } + } + + fn push_checked( + entry: &mut String, + entries: &mut Vec, + elem_type: &Type, + ) -> Result<(), JsonConversionError> { + if !entry.is_empty() { + // While in usual postgres response we get nulls as None and everything else + // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while + // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs + // here while we have quotation info and convert them to None. 
+ if entry == "NULL" { + entries.push(pg_text_to_json(None, elem_type)?); + } else { + entries.push(pg_text_to_json(Some(entry), elem_type)?); + } + entry.clear(); + } + + Ok(()) + } + + while let Some((mut i, mut c)) = pg_array_chr.next() { + let mut escaped = false; + + if c == '\\' { + escaped = true; + (i, c) = pg_array_chr.next().unwrap(); + } + + match c { + '{' if !quote => { + level += 1; + if level > 1 { + let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + entries.push(res); + for _ in 0..off - 1 { + pg_array_chr.next(); + } + } + } + '}' if !quote => { + level -= 1; + if level == 0 { + push_checked(&mut entry, &mut entries, elem_type)?; + if nested { + return Ok((Value::Array(entries), i)); + } + } + } + '"' if !escaped => { + if quote { + // end of quoted string, so push it manually without any checks + // for emptiness or nulls + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + entry.clear(); + } + quote = !quote; + } + ',' if !quote => { + push_checked(&mut entry, &mut entries, elem_type)?; + } + _ => { + entry.push(c); + } + } + } + + if level != 0 { + return Err(JsonConversionError::UnbalancedArray); + } + + Ok((Value::Array(entries), 0)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_atomic_types_to_pg_params() { + let json = vec![Value::Bool(true), Value::Bool(false)]; + let pg_params = json_to_pg_text(json); + assert_eq!( + pg_params, + vec![Some("true".to_owned()), Some("false".to_owned())] + ); + + let json = vec![Value::Number(serde_json::Number::from(42))]; + let pg_params = json_to_pg_text(json); + assert_eq!(pg_params, vec![Some("42".to_owned())]); + + let json = vec![Value::String("foo\"".to_string())]; + let pg_params = json_to_pg_text(json); + assert_eq!(pg_params, vec![Some("foo\"".to_owned())]); + + let json = vec![Value::Null]; + let pg_params = json_to_pg_text(json); + assert_eq!(pg_params, vec![None]); + } + + #[test] + fn test_json_array_to_pg_array() { + // 
atoms and escaping + let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]); + assert_eq!( + pg_params, + vec![Some( + "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned() + )] + ); + + // nested arrays + let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]); + assert_eq!( + pg_params, + vec![Some( + "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() + )] + ); + // array of objects + let json = r#"[{"foo": 1},{"bar": 2}]"#; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]); + assert_eq!( + pg_params, + vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())] + ); + } + + #[test] + fn test_atomic_types_parse() { + assert_eq!( + pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), + json!("foo") + ); + assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); + assert_eq!( + pg_text_to_json(Some("42"), &Type::INT8).unwrap(), + json!("42") + ); + assert_eq!( + pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), + json!("NaN") + ); + assert_eq!( + pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), + json!("Infinity") + ); + assert_eq!( + pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), + json!("-Infinity") + ); + + let json: Value = + serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") + .unwrap(); + assert_eq!( + pg_text_to_json( + 
Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), + &Type::JSONB + ) + .unwrap(), + json + ); + } + + #[test] + fn test_pg_array_parse_text() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::TEXT).unwrap() + } + assert_eq!( + pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), + json!(["aa\"\\,a", "cha", "bbbb"]) + ); + assert_eq!( + pt(r#"{{"foo","bar"},{"bee","bop"}}"#), + json!([["foo", "bar"], ["bee", "bop"]]) + ); + assert_eq!( + pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), + json!([[[["foo", null, "bop", "bup"]]]]) + ); + assert_eq!( + pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), + json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) + ); + } + + #[test] + fn test_pg_array_parse_bool() { + fn pb(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::BOOL).unwrap() + } + assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); + assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); + assert_eq!( + pb(r#"{{t,f},{f,t}}"#), + json!([[true, false], [false, true]]) + ); + assert_eq!( + pb(r#"{{t,NULL},{NULL,f}}"#), + json!([[true, null], [null, false]]) + ); + } + + #[test] + fn test_pg_array_parse_numbers() { + fn pn(pg_arr: &str, ty: &Type) -> Value { + pg_array_parse(pg_arr, ty).unwrap() + } + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), + json!(["NaN", "Infinity", "-Infinity"]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), + json!(["NaN", "Infinity", "-Infinity"]) + ); + } + + #[test] + fn 
test_pg_array_with_decoration() { + fn p(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::INT2).unwrap() + } + assert_eq!( + p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), + json!([[[1, 2, 3], [4, 5, 6]]]) + ); + } + + #[test] + fn test_pg_array_parse_json() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::JSONB).unwrap() + } + assert_eq!(pt(r#"{"{}"}"#), json!([{}])); + assert_eq!( + pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#), + json!([{"foo": 1, "bar": 2}]) + ); + assert_eq!( + pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#), + json!([{"foo": 1}, {"bar": 2}]) + ); + assert_eq!( + pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#), + json!([[{"foo": 1}, {"bar": 2}]]) + ); + } +} diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 70c0343fa3..7a99aeb759 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,44 +1,74 @@ +use std::pin::pin; use std::sync::Arc; -use anyhow::bail; -use futures::pin_mut; +use bytes::Bytes; +use futures::future::select; +use futures::future::try_join; +use futures::future::Either; use futures::StreamExt; -use hyper::body::HttpBody; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{Body, HeaderMap, Request}; +use futures::TryFutureExt; +use http_body_util::BodyExt; +use http_body_util::Full; +use hyper1::body::Body; +use hyper1::body::Incoming; +use hyper1::header; +use hyper1::http::HeaderName; +use hyper1::http::HeaderValue; +use hyper1::Response; +use hyper1::StatusCode; +use hyper1::{HeaderMap, Request}; +use pq_proto::StartupMessageParamsBuilder; use serde_json::json; -use serde_json::Map; use serde_json::Value; -use smol_str::SmolStr; +use tokio::time; use tokio_postgres::error::DbError; -use tokio_postgres::types::Kind; -use tokio_postgres::types::Type; +use tokio_postgres::error::ErrorPosition; +use tokio_postgres::error::SqlState; use 
tokio_postgres::GenericClient; use tokio_postgres::IsolationLevel; +use tokio_postgres::NoTls; use tokio_postgres::ReadyForQueryStatus; -use tokio_postgres::Row; use tokio_postgres::Transaction; +use tokio_util::sync::CancellationToken; use tracing::error; -use tracing::instrument; +use tracing::info; use url::Url; use utils::http::error::ApiError; -use utils::http::json::json_response; -use crate::config::HttpConfig; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::endpoint_sni; +use crate::auth::ComputeUserInfoParseError; +use crate::config::ProxyConfig; +use crate::config::TlsConfig; use crate::context::RequestMonitoring; -use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::error::ErrorKind; +use crate::error::ReportableError; +use crate::error::UserFacingError; +use crate::metrics::HttpDirection; +use crate::metrics::Metrics; +use crate::proxy::run_until_cancelled; +use crate::proxy::NeonOptions; +use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::MetricCounterRecorder; +use crate::DbName; +use crate::RoleName; +use super::backend::PoolingBackend; +use super::conn_pool::Client; use super::conn_pool::ConnInfo; -use super::conn_pool::GlobalConnPool; +use super::http_util::json_response; +use super::json::json_to_pg_text; +use super::json::pg_text_row_to_json; +use super::json::JsonConversionError; #[derive(serde::Deserialize)] +#[serde(rename_all = "camelCase")] struct QueryData { query: String, - params: Vec, + #[serde(deserialize_with = "bytes_to_pg_text")] + params: Vec>, + #[serde(default)] + array_mode: Option, } #[derive(serde::Deserialize)] @@ -65,300 +95,483 @@ static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrab static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); -// -// Convert json non-string types to strings, so that they can be passed to Postgres -// as parameters. 
-// -fn json_to_pg_text(json: Vec) -> Vec> { - json.iter() - .map(|value| { - match value { - // special care for nulls - Value::Null => None, - - // convert to text with escaping - v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()), - - // avoid escaping here, as we pass this as a parameter - Value::String(s) => Some(s.to_string()), - - // special care for arrays - Value::Array(_) => json_array_to_pg_array(value), - } - }) - .collect() +fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result>, D::Error> +where + D: serde::de::Deserializer<'de>, +{ + // TODO: consider avoiding the allocation here. + let json: Vec = serde::de::Deserialize::deserialize(deserializer)?; + Ok(json_to_pg_text(json)) } -// -// Serialize a JSON array to a Postgres array. Contrary to the strings in the params -// in the array we need to escape the strings. Postgres is okay with arrays of form -// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving -// it for Postgres to check. 
-// -// Example of the same escaping in node-postgres: packages/pg/lib/utils.js -// -fn json_array_to_pg_array(value: &Value) -> Option { - match value { - // special care for nulls - Value::Null => None, +#[derive(Debug, thiserror::Error)] +pub enum ConnInfoError { + #[error("invalid header: {0}")] + InvalidHeader(&'static str), + #[error("invalid connection string: {0}")] + UrlParseError(#[from] url::ParseError), + #[error("incorrect scheme")] + IncorrectScheme, + #[error("missing database name")] + MissingDbName, + #[error("invalid database name")] + InvalidDbName, + #[error("missing username")] + MissingUsername, + #[error("invalid username: {0}")] + InvalidUsername(#[from] std::string::FromUtf8Error), + #[error("missing password")] + MissingPassword, + #[error("missing hostname")] + MissingHostname, + #[error("invalid hostname: {0}")] + InvalidEndpoint(#[from] ComputeUserInfoParseError), + #[error("malformed endpoint")] + MalformedEndpoint, +} - // convert to text with escaping - // here string needs to be escaped, as it is part of the array - v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()), - v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())), +impl ReportableError for ConnInfoError { + fn get_error_kind(&self) -> ErrorKind { + ErrorKind::User + } +} - // recurse into array - Value::Array(arr) => { - let vals = arr - .iter() - .map(json_array_to_pg_array) - .map(|v| v.unwrap_or_else(|| "NULL".to_string())) - .collect::>() - .join(","); - - Some(format!("{{{}}}", vals)) - } +impl UserFacingError for ConnInfoError { + fn to_string_client(&self) -> String { + self.to_string() } } fn get_conn_info( ctx: &mut RequestMonitoring, headers: &HeaderMap, - sni_hostname: Option, -) -> Result { + tls: &TlsConfig, +) -> Result { + // HTTP only uses cleartext (for now and likely always) + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + let connection_string = headers .get("Neon-Connection-String") - 
.ok_or(anyhow::anyhow!("missing connection string"))? - .to_str()?; + .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))? + .to_str() + .map_err(|_| ConnInfoError::InvalidHeader("Neon-Connection-String"))?; let connection_url = Url::parse(connection_string)?; let protocol = connection_url.scheme(); if protocol != "postgres" && protocol != "postgresql" { - return Err(anyhow::anyhow!( - "connection string must start with postgres: or postgresql:" - )); + return Err(ConnInfoError::IncorrectScheme); } let mut url_path = connection_url .path_segments() - .ok_or(anyhow::anyhow!("missing database name"))?; + .ok_or(ConnInfoError::MissingDbName)?; - let dbname = url_path - .next() - .ok_or(anyhow::anyhow!("invalid database name"))?; + let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into(); + ctx.set_dbname(dbname.clone()); - let username = SmolStr::from(connection_url.username()); + let username = RoleName::from(urlencoding::decode(connection_url.username())?); if username.is_empty() { - return Err(anyhow::anyhow!("missing username")); + return Err(ConnInfoError::MissingUsername); } ctx.set_user(username.clone()); let password = connection_url .password() - .ok_or(anyhow::anyhow!("no password"))?; - - // TLS certificate selector now based on SNI hostname, so if we are running here - // we are sure that SNI hostname is set to one of the configured domain names. 
- let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?; + .ok_or(ConnInfoError::MissingPassword)?; + let password = urlencoding::decode_binary(password.as_bytes()); let hostname = connection_url .host_str() - .ok_or(anyhow::anyhow!("no host"))?; + .ok_or(ConnInfoError::MissingHostname)?; - let host_header = headers - .get("host") - .and_then(|h| h.to_str().ok()) - .and_then(|h| h.split(':').next()); - - if hostname != sni_hostname { - return Err(anyhow::anyhow!("mismatched SNI hostname and hostname")); - } else if let Some(h) = host_header { - if h != hostname { - return Err(anyhow::anyhow!("mismatched host header and hostname")); - } - } - - let hostname: SmolStr = hostname.into(); - ctx.set_endpoint_id(Some(hostname.clone())); + let endpoint = + endpoint_sni(hostname, &tls.common_names)?.ok_or(ConnInfoError::MalformedEndpoint)?; + ctx.set_endpoint_id(endpoint.clone()); let pairs = connection_url.query_pairs(); let mut options = Option::None; + let mut params = StartupMessageParamsBuilder::default(); + params.insert("user", &username); + params.insert("database", &dbname); for (key, value) in pairs { + params.insert(&key, &value); if key == "options" { - options = Some(value.into()); - break; + options = Some(NeonOptions::parse_options_raw(&value)); } } + let user_info = ComputeUserInfo { + endpoint, + user: username, + options: options.unwrap_or_default(), + }; + Ok(ConnInfo { - username, - dbname: dbname.into(), - hostname, - password: password.into(), - options, + user_info, + dbname, + password: match password { + std::borrow::Cow::Borrowed(b) => b.into(), + std::borrow::Cow::Owned(b) => b.into(), + }, }) } // TODO: return different http error codes pub async fn handle( - config: &'static HttpConfig, - ctx: &mut RequestMonitoring, - request: Request, - sni_hostname: Option, - conn_pool: Arc, -) -> Result, ApiError> { - let result = tokio::time::timeout( - config.request_timeout, - handle_inner(config, ctx, request, sni_hostname, 
conn_pool), - ) - .await; + config: &'static ProxyConfig, + mut ctx: RequestMonitoring, + request: Request, + backend: Arc, + cancel: CancellationToken, +) -> Result>, ApiError> { + let result = handle_inner(cancel, config, &mut ctx, request, backend).await; + let mut response = match result { - Ok(r) => match r { - Ok(r) => r, - Err(e) => { - let message = format!("{:?}", e); - let db_error = e - .downcast_ref::() - .and_then(|e| e.as_db_error()); - fn get<'a, T: serde::Serialize>( - db: Option<&'a DbError>, - x: impl FnOnce(&'a DbError) -> T, - ) -> Value { - db.map(x) - .and_then(|t| serde_json::to_value(t).ok()) - .unwrap_or_default() - } + Ok(r) => { + ctx.set_success(); + r + } + Err(e @ SqlOverHttpError::Cancelled(_)) => { + let error_kind = e.get_error_kind(); + ctx.set_error_kind(error_kind); - // TODO(conrad): db_error.position() - let code = get(db_error, |db| db.code().code()); - let severity = get(db_error, |db| db.severity()); - let detail = get(db_error, |db| db.detail()); - let hint = get(db_error, |db| db.hint()); - let where_ = get(db_error, |db| db.where_()); - let table = get(db_error, |db| db.table()); - let column = get(db_error, |db| db.column()); - let schema = get(db_error, |db| db.schema()); - let datatype = get(db_error, |db| db.datatype()); - let constraint = get(db_error, |db| db.constraint()); - let file = get(db_error, |db| db.file()); - let line = get(db_error, |db| db.line()); - let routine = get(db_error, |db| db.routine()); + let message = "Query cancelled, connection was terminated"; - error!( - ?code, - "sql-over-http per-client task finished with an error: {e:#}" - ); - // TODO: this shouldn't always be bad request. 
- json_response( - StatusCode::BAD_REQUEST, - json!({ - "message": message, - "code": code, - "detail": detail, - "hint": hint, - "severity": severity, - "where": where_, - "table": table, - "column": column, - "schema": schema, - "datatype": datatype, - "constraint": constraint, - "file": file, - "line": line, - "routine": routine, - }), - )? - } - }, - Err(_) => { - let message = format!( - "HTTP-Connection timed out, execution time exeeded {} seconds", - config.request_timeout.as_secs() + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" ); - error!(message); + json_response( - StatusCode::GATEWAY_TIMEOUT, - json!({ "message": message, "code": StatusCode::GATEWAY_TIMEOUT.as_u16() }), + StatusCode::BAD_REQUEST, + json!({ "message": message, "code": SqlState::PROTOCOL_VIOLATION.code() }), + )? + } + Err(e) => { + let error_kind = e.get_error_kind(); + ctx.set_error_kind(error_kind); + + let mut message = e.to_string_client(); + let db_error = match &e { + SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; + fn get<'a, T: serde::Serialize>( + db: Option<&'a DbError>, + x: impl FnOnce(&'a DbError) -> T, + ) -> Value { + db.map(x) + .and_then(|t| serde_json::to_value(t).ok()) + .unwrap_or_default() + } + + if let Some(db_error) = db_error { + db_error.message().clone_into(&mut message); + } + + let position = db_error.and_then(|db| db.position()); + let (position, internal_position, internal_query) = match position { + Some(ErrorPosition::Original(position)) => ( + Value::String(position.to_string()), + Value::Null, + Value::Null, + ), + Some(ErrorPosition::Internal { position, query }) => ( + Value::Null, + Value::String(position.to_string()), + Value::String(query.clone()), + ), + None => (Value::Null, Value::Null, Value::Null), + }; + + let code = get(db_error, |db| db.code().code()); + let severity = get(db_error, 
|db| db.severity()); + let detail = get(db_error, |db| db.detail()); + let hint = get(db_error, |db| db.hint()); + let where_ = get(db_error, |db| db.where_()); + let table = get(db_error, |db| db.table()); + let column = get(db_error, |db| db.column()); + let schema = get(db_error, |db| db.schema()); + let datatype = get(db_error, |db| db.datatype()); + let constraint = get(db_error, |db| db.constraint()); + let file = get(db_error, |db| db.file()); + let line = get(db_error, |db| db.line().map(|l| l.to_string())); + let routine = get(db_error, |db| db.routine()); + + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + + // TODO: this shouldn't always be bad request. + json_response( + StatusCode::BAD_REQUEST, + json!({ + "message": message, + "code": code, + "detail": detail, + "hint": hint, + "position": position, + "internalPosition": internal_position, + "internalQuery": internal_query, + "severity": severity, + "where": where_, + "table": table, + "column": column, + "schema": schema, + "dataType": datatype, + "constraint": constraint, + "file": file, + "line": line, + "routine": routine, + }), )? 
} }; - response.headers_mut().insert( - "Access-Control-Allow-Origin", - hyper::http::HeaderValue::from_static("*"), - ); + + response + .headers_mut() + .insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); Ok(response) } -#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)] +#[derive(Debug, thiserror::Error)] +pub enum SqlOverHttpError { + #[error("{0}")] + ReadPayload(#[from] ReadPayloadError), + #[error("{0}")] + ConnectCompute(#[from] HttpConnError), + #[error("{0}")] + ConnInfo(#[from] ConnInfoError), + #[error("request is too large (max is {MAX_REQUEST_SIZE} bytes)")] + RequestTooLarge, + #[error("response is too large (max is {MAX_RESPONSE_SIZE} bytes)")] + ResponseTooLarge, + #[error("invalid isolation level")] + InvalidIsolationLevel, + #[error("{0}")] + Postgres(#[from] tokio_postgres::Error), + #[error("{0}")] + JsonConversion(#[from] JsonConversionError), + #[error("{0}")] + Cancelled(SqlOverHttpCancel), +} + +impl ReportableError for SqlOverHttpError { + fn get_error_kind(&self) -> ErrorKind { + match self { + SqlOverHttpError::ReadPayload(e) => e.get_error_kind(), + SqlOverHttpError::ConnectCompute(e) => e.get_error_kind(), + SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), + SqlOverHttpError::RequestTooLarge => ErrorKind::User, + SqlOverHttpError::ResponseTooLarge => ErrorKind::User, + SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, + SqlOverHttpError::Postgres(p) => p.get_error_kind(), + SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, + SqlOverHttpError::Cancelled(c) => c.get_error_kind(), + } + } +} + +impl UserFacingError for SqlOverHttpError { + fn to_string_client(&self) -> String { + match self { + SqlOverHttpError::ReadPayload(p) => p.to_string(), + SqlOverHttpError::ConnectCompute(c) => c.to_string_client(), + SqlOverHttpError::ConnInfo(c) => c.to_string_client(), + SqlOverHttpError::RequestTooLarge => self.to_string(), + SqlOverHttpError::ResponseTooLarge => 
self.to_string(), + SqlOverHttpError::InvalidIsolationLevel => self.to_string(), + SqlOverHttpError::Postgres(p) => p.to_string(), + SqlOverHttpError::JsonConversion(_) => "could not parse postgres response".to_string(), + SqlOverHttpError::Cancelled(_) => self.to_string(), + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum ReadPayloadError { + #[error("could not read the HTTP request body: {0}")] + Read(#[from] hyper1::Error), + #[error("could not parse the HTTP request body: {0}")] + Parse(#[from] serde_json::Error), +} + +impl ReportableError for ReadPayloadError { + fn get_error_kind(&self) -> ErrorKind { + match self { + ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, + ReadPayloadError::Parse(_) => ErrorKind::User, + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum SqlOverHttpCancel { + #[error("query was cancelled")] + Postgres, + #[error("query was cancelled while stuck trying to connect to the database")] + Connect, +} + +impl ReportableError for SqlOverHttpCancel { + fn get_error_kind(&self) -> ErrorKind { + match self { + SqlOverHttpCancel::Postgres => ErrorKind::ClientDisconnect, + SqlOverHttpCancel::Connect => ErrorKind::ClientDisconnect, + } + } +} + +#[derive(Clone, Copy, Debug)] +struct HttpHeaders { + raw_output: bool, + default_array_mode: bool, + txn_isolation_level: Option, + txn_read_only: bool, + txn_deferrable: bool, +} + +impl HttpHeaders { + fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { + // Determine the output options. Default behaviour is 'false'. Anything that is not + // strictly 'true' assumed to be false. 
+ let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); + let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); + + // isolation level, read only and deferrable + let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) { + Some(x) => Some( + map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?, + ), + None => None, + }; + + let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); + let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + + Ok(Self { + raw_output, + default_array_mode, + txn_isolation_level, + txn_read_only, + txn_deferrable, + }) + } +} + +fn map_header_to_isolation_level(level: &HeaderValue) -> Option { + match level.as_bytes() { + b"Serializable" => Some(IsolationLevel::Serializable), + b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted), + b"ReadCommitted" => Some(IsolationLevel::ReadCommitted), + b"RepeatableRead" => Some(IsolationLevel::RepeatableRead), + _ => None, + } +} + +fn map_isolation_level_to_headers(level: IsolationLevel) -> Option { + match level { + IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")), + IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")), + IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")), + IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")), + _ => None, + } +} + async fn handle_inner( - config: &'static HttpConfig, + cancel: CancellationToken, + config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - request: Request, - sni_hostname: Option, - conn_pool: Arc, -) -> anyhow::Result> { - let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE - .with_label_values(&["http"]) - .guard(); + request: Request, + backend: Arc, +) -> Result>, SqlOverHttpError> { + let _requeset_gauge = Metrics::get().proxy.connection_requests.guard(ctx.protocol); + info!( + protocol 
= %ctx.protocol, + "handling interactive connection from client" + ); // // Determine the destination and connection params // let headers = request.headers(); - let conn_info = get_conn_info(ctx, headers, sni_hostname)?; - // Determine the output options. Default behaviour is 'false'. Anything that is not - // strictly 'true' assumed to be false. - let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); - let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); + // TLS config should be there. + let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; + info!(user = conn_info.user_info.user.as_str(), "credentials"); // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in - let allow_pool = - !config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = !config.http_config.pool_options.opt_in + || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); - // isolation level, read only and deferrable + let parsed_headers = HttpHeaders::try_parse(headers)?; - let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned(); - let txn_isolation_level = match txn_isolation_level_raw { - Some(ref x) => Some(match x.as_bytes() { - b"Serializable" => IsolationLevel::Serializable, - b"ReadUncommitted" => IsolationLevel::ReadUncommitted, - b"ReadCommitted" => IsolationLevel::ReadCommitted, - b"RepeatableRead" => IsolationLevel::RepeatableRead, - _ => bail!("invalid isolation level"), - }), - None => None, - }; - - let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); - let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); - - let paused = ctx.latency_timer.pause(); let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; - drop(paused); + info!(request_content_length, "request size in bytes"); + 
Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Request, request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body if request_content_length > MAX_REQUEST_SIZE { - return Err(anyhow::anyhow!( - "request is too large (max is {MAX_REQUEST_SIZE} bytes)" - )); + return Err(SqlOverHttpError::RequestTooLarge); } - // - // Read the query and query params from the request body - // - let body = hyper::body::to_bytes(request.into_body()).await?; - let payload: Payload = serde_json::from_slice(&body)?; + let fetch_and_process_request = Box::pin( + async { + let body = request.into_body().collect().await?.to_bytes(); + info!(length = body.len(), "request payload read"); + let payload: Payload = serde_json::from_slice(&body)?; + Ok::(payload) // Adjust error type accordingly + } + .map_err(SqlOverHttpError::from), + ); - let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?; + let authenticate_and_connect = Box::pin( + async { + let keys = backend + .authenticate(ctx, &config.authentication_config, &conn_info) + .await?; + let client = backend + .connect_to_compute(ctx, conn_info, keys, !allow_pool) + .await?; + // not strictly necessary to mark success here, + // but it's just insurance for if we forget it somewhere else + ctx.latency_timer.success(); + Ok::<_, HttpConnError>(client) + } + .map_err(SqlOverHttpError::from), + ); + + let (payload, mut client) = match run_until_cancelled( + // Run both operations in parallel + try_join( + pin!(fetch_and_process_request), + pin!(authenticate_and_connect), + ), + &cancel, + ) + .await + { + Some(result) => result?, + None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)), + }; let mut response = Response::builder() .status(StatusCode::OK) @@ -367,93 +580,35 @@ async fn handle_inner( // // Now execute the query and return the result // - let mut size 
= 0; - let result = - match payload { - Payload::Single(stmt) => { - let (status, results) = - query_to_json(&*client, stmt, &mut 0, raw_output, array_mode) - .await - .map_err(|e| { - client.discard(); - e - })?; - client.check_idle(status); - results + let result = match payload { + Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, + Payload::Batch(statements) => { + if parsed_headers.txn_read_only { + response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); } - Payload::Batch(statements) => { - let (inner, mut discard) = client.inner(); - let mut builder = inner.build_transaction(); - if let Some(isolation_level) = txn_isolation_level { - builder = builder.isolation_level(isolation_level); - } - if txn_read_only { - builder = builder.read_only(true); - } - if txn_deferrable { - builder = builder.deferrable(true); - } - - let transaction = builder.start().await.map_err(|e| { - // if we cannot start a transaction, we should return immediately - // and not return to the pool. 
connection is clearly broken - discard.discard(); - e - })?; - - let results = - match query_batch(&transaction, statements, &mut size, raw_output, array_mode) - .await - { - Ok(results) => { - let status = transaction.commit().await.map_err(|e| { - // if we cannot commit - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - e - })?; - discard.check_idle(status); - results - } - Err(err) => { - let status = transaction.rollback().await.map_err(|e| { - // if we cannot rollback - for now don't return connection to pool - // TODO: get a query status from the error - discard.discard(); - e - })?; - discard.check_idle(status); - return Err(err); - } - }; - - if txn_read_only { - response = response.header( - TXN_READ_ONLY.clone(), - HeaderValue::try_from(txn_read_only.to_string())?, - ); - } - if txn_deferrable { - response = response.header( - TXN_DEFERRABLE.clone(), - HeaderValue::try_from(txn_deferrable.to_string())?, - ); - } - if let Some(txn_isolation_level) = txn_isolation_level_raw { - response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); - } - json!({ "results": results }) + if parsed_headers.txn_deferrable { + response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); + } + if let Some(txn_isolation_level) = parsed_headers + .txn_isolation_level + .and_then(map_isolation_level_to_headers) + { + response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); } - }; - ctx.log(); + statements + .process(cancel, &mut client, parsed_headers) + .await? + } + }; + let metrics = client.metrics(); // how could this possibly fail let body = serde_json::to_string(&result).expect("json serialization should not fail"); let len = body.len(); let response = response - .body(Body::from(body)) + .body(Full::new(Bytes::from(body))) // only fails if invalid status code or invalid header/values are given. 
// these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -461,26 +616,176 @@ async fn handle_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... // moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); + Metrics::get() + .proxy + .http_conn_content_length_bytes + .observe(HttpDirection::Response, len as f64); Ok(response) } +impl QueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + + let res = match select( + pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)), + pin!(cancel.cancelled()), + ) + .await + { + // The query successfully completed. + Either::Left((Ok((status, results)), __not_yet_cancelled)) => { + discard.check_idle(status); + Ok(results) + } + // The query failed with an error + Either::Left((Err(e), __not_yet_cancelled)) => { + discard.discard(); + return Err(e); + } + // The query was cancelled. + Either::Right((_cancelled, query)) => { + tracing::info!("cancelling query"); + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // wait for the query cancellation + match time::timeout(time::Duration::from_millis(100), query).await { + // query successed before it was cancelled. + Ok(Ok((status, results))) => { + discard.check_idle(status); + Ok(results) + } + // query failed or was cancelled. 
+ Ok(Err(error)) => { + let db_error = match &error { + SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; + + // if errored for some other reason, it might not be safe to return + if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { + discard.discard(); + } + + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + Err(_timeout) => { + discard.discard(); + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + } + } + }; + res + } +} + +impl BatchQueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + info!("starting transaction"); + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + let mut builder = inner.build_transaction(); + if let Some(isolation_level) = parsed_headers.txn_isolation_level { + builder = builder.isolation_level(isolation_level); + } + if parsed_headers.txn_read_only { + builder = builder.read_only(true); + } + if parsed_headers.txn_deferrable { + builder = builder.deferrable(true); + } + + let transaction = builder.start().await.map_err(|e| { + // if we cannot start a transaction, we should return immediately + // and not return to the pool. 
connection is clearly broken + discard.discard(); + e + })?; + + let results = + match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { + Ok(results) => { + info!("commit"); + let status = transaction.commit().await.map_err(|e| { + // if we cannot commit - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + e + })?; + discard.check_idle(status); + results + } + Err(SqlOverHttpError::Cancelled(_)) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. + discard.discard(); + + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); + } + Err(err) => { + info!("rollback"); + let status = transaction.rollback().await.map_err(|e| { + // if we cannot rollback - for now don't return connection to pool + // TODO: get a query status from the error + discard.discard(); + e + })?; + discard.check_idle(status); + return Err(err); + } + }; + + Ok(json!({ "results": results })) + } +} + async fn query_batch( + cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, - total_size: &mut usize, - raw_output: bool, - array_mode: bool, -) -> anyhow::Result> { + parsed_headers: HttpHeaders, +) -> Result, SqlOverHttpError> { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; for stmt in queries.queries { - // TODO: maybe we should check that the transaction bit is set here - let (_, values) = - query_to_json(transaction, stmt, &mut current_size, raw_output, array_mode).await?; - results.push(values); + let query = pin!(query_to_json( + transaction, + stmt, + &mut current_size, + parsed_headers, + )); + let cancelled = pin!(cancel.cancelled()); + let res = select(query, cancelled).await; + match res { + // TODO: maybe we should check that the transaction bit is set 
here + Either::Left((Ok((_, values)), _cancelled)) => { + results.push(values); + } + Either::Left((Err(e), _cancelled)) => { + return Err(e); + } + Either::Right((_cancelled, _)) => { + return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); + } + } } - *total_size += current_size; Ok(results) } @@ -488,16 +793,16 @@ async fn query_to_json( client: &T, data: QueryData, current_size: &mut usize, - raw_output: bool, - array_mode: bool, -) -> anyhow::Result<(ReadyForQueryStatus, Value)> { - let query_params = json_to_pg_text(data.params); - let row_stream = client.query_raw_txt(&data.query, query_params).await?; + parsed_headers: HttpHeaders, +) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { + info!("executing query"); + let query_params = data.params; + let mut row_stream = std::pin::pin!(client.query_raw_txt(&data.query, query_params).await?); + info!("finished executing query"); // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. 
- pin_mut!(row_stream); let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { let row = row?; @@ -506,9 +811,7 @@ async fn query_to_json( // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) if *current_size > MAX_RESPONSE_SIZE { - return Err(anyhow::anyhow!( - "response is too large (max is {MAX_RESPONSE_SIZE} bytes)" - )); + return Err(SqlOverHttpError::ResponseTooLarge); } } @@ -527,6 +830,13 @@ async fn query_to_json( } .and_then(|s| s.parse::().ok()); + info!( + rows = rows.len(), + ?ready, + command_tag, + "finished reading rows" + ); + let mut fields = vec![]; let mut columns = vec![]; @@ -543,10 +853,12 @@ async fn query_to_json( columns.push(client.get_type(c.type_oid()).await?); } + let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); + // convert rows to JSON let rows = rows .iter() - .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode)) + .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; // resulting JSON format is based on the format of node-postgres result @@ -561,389 +873,3 @@ async fn query_to_json( }), )) } - -// -// Convert postgres row with text-encoded values to JSON object -// -pub fn pg_text_row_to_json( - row: &Row, - columns: &[Type], - raw_output: bool, - array_mode: bool, -) -> Result { - let iter = row - .columns() - .iter() - .zip(columns) - .enumerate() - .map(|(i, (column, typ))| { - let name = column.name(); - let pg_value = row.as_text(i)?; - let json_value = if raw_output { - match pg_value { - Some(v) => Value::String(v.to_string()), - None => Value::Null, - } - } else { - pg_text_to_json(pg_value, typ)? 
- }; - Ok((name.to_string(), json_value)) - }); - - if array_mode { - // drop keys and aggregate into array - let arr = iter - .map(|r| r.map(|(_key, val)| val)) - .collect::, anyhow::Error>>()?; - Ok(Value::Array(arr)) - } else { - let obj = iter.collect::, anyhow::Error>>()?; - Ok(Value::Object(obj)) - } -} - -// -// Convert postgres text-encoded value to JSON value -// -pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { - if let Some(val) = pg_value { - if let Kind::Array(elem_type) = pg_type.kind() { - return pg_array_parse(val, elem_type); - } - - match *pg_type { - Type::BOOL => Ok(Value::Bool(val == "t")), - Type::INT2 | Type::INT4 => { - let val = val.parse::()?; - Ok(Value::Number(serde_json::Number::from(val))) - } - Type::FLOAT4 | Type::FLOAT8 => { - let fval = val.parse::()?; - let num = serde_json::Number::from_f64(fval); - if let Some(num) = num { - Ok(Value::Number(num)) - } else { - // Pass Nan, Inf, -Inf as strings - // JS JSON.stringify() does converts them to null, but we - // want to preserve them, so we pass them as strings - Ok(Value::String(val.to_string())) - } - } - Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), - _ => Ok(Value::String(val.to_string())), - } - } else { - Ok(Value::Null) - } -} - -// -// Parse postgres array into JSON array. -// -// This is a bit involved because we need to handle nested arrays and quoted -// values. Unlike postgres we don't check that all nested arrays have the same -// dimensions, we just return them as is. 
-// -fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { - _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) -} - -fn _pg_array_parse( - pg_array: &str, - elem_type: &Type, - nested: bool, -) -> Result<(Value, usize), anyhow::Error> { - let mut pg_array_chr = pg_array.char_indices(); - let mut level = 0; - let mut quote = false; - let mut entries: Vec = Vec::new(); - let mut entry = String::new(); - - // skip bounds decoration - if let Some('[') = pg_array.chars().next() { - for (_, c) in pg_array_chr.by_ref() { - if c == '=' { - break; - } - } - } - - fn push_checked( - entry: &mut String, - entries: &mut Vec, - elem_type: &Type, - ) -> Result<(), anyhow::Error> { - if !entry.is_empty() { - // While in usual postgres response we get nulls as None and everything else - // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while - // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs - // here while we have quotation info and convert them to None. 
- if entry == "NULL" { - entries.push(pg_text_to_json(None, elem_type)?); - } else { - entries.push(pg_text_to_json(Some(entry), elem_type)?); - } - entry.clear(); - } - - Ok(()) - } - - while let Some((mut i, mut c)) = pg_array_chr.next() { - let mut escaped = false; - - if c == '\\' { - escaped = true; - (i, c) = pg_array_chr.next().unwrap(); - } - - match c { - '{' if !quote => { - level += 1; - if level > 1 { - let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; - entries.push(res); - for _ in 0..off - 1 { - pg_array_chr.next(); - } - } - } - '}' if !quote => { - level -= 1; - if level == 0 { - push_checked(&mut entry, &mut entries, elem_type)?; - if nested { - return Ok((Value::Array(entries), i)); - } - } - } - '"' if !escaped => { - if quote { - // end of quoted string, so push it manually without any checks - // for emptiness or nulls - entries.push(pg_text_to_json(Some(&entry), elem_type)?); - entry.clear(); - } - quote = !quote; - } - ',' if !quote => { - push_checked(&mut entry, &mut entries, elem_type)?; - } - _ => { - entry.push(c); - } - } - } - - if level != 0 { - return Err(anyhow::anyhow!("unbalanced array")); - } - - Ok((Value::Array(entries), 0)) -} - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_atomic_types_to_pg_params() { - let json = vec![Value::Bool(true), Value::Bool(false)]; - let pg_params = json_to_pg_text(json); - assert_eq!( - pg_params, - vec![Some("true".to_owned()), Some("false".to_owned())] - ); - - let json = vec![Value::Number(serde_json::Number::from(42))]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![Some("42".to_owned())]); - - let json = vec![Value::String("foo\"".to_string())]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![Some("foo\"".to_owned())]); - - let json = vec![Value::Null]; - let pg_params = json_to_pg_text(json); - assert_eq!(pg_params, vec![None]); - } - - #[test] - fn test_json_array_to_pg_array() { - // 
atoms and escaping - let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]"; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some( - "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned() - )] - ); - - // nested arrays - let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some( - "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() - )] - ); - // array of objects - let json = r#"[{"foo": 1},{"bar": 2}]"#; - let json: Value = serde_json::from_str(json).unwrap(); - let pg_params = json_to_pg_text(vec![json]); - assert_eq!( - pg_params, - vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())] - ); - } - - #[test] - fn test_atomic_types_parse() { - assert_eq!( - pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), - json!("foo") - ); - assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); - assert_eq!( - pg_text_to_json(Some("42"), &Type::INT8).unwrap(), - json!("42") - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), - json!("NaN") - ); - assert_eq!( - pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), - json!("Infinity") - ); - assert_eq!( - pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), - json!("-Infinity") - ); - - let json: Value = - serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") - .unwrap(); - assert_eq!( - pg_text_to_json( - 
Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), - &Type::JSONB - ) - .unwrap(), - json - ); - } - - #[test] - fn test_pg_array_parse_text() { - fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::TEXT).unwrap() - } - assert_eq!( - pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), - json!(["aa\"\\,a", "cha", "bbbb"]) - ); - assert_eq!( - pt(r#"{{"foo","bar"},{"bee","bop"}}"#), - json!([["foo", "bar"], ["bee", "bop"]]) - ); - assert_eq!( - pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), - json!([[[["foo", null, "bop", "bup"]]]]) - ); - assert_eq!( - pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), - json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) - ); - } - - #[test] - fn test_pg_array_parse_bool() { - fn pb(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::BOOL).unwrap() - } - assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); - assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); - assert_eq!( - pb(r#"{{t,f},{f,t}}"#), - json!([[true, false], [false, true]]) - ); - assert_eq!( - pb(r#"{{t,NULL},{NULL,f}}"#), - json!([[true, null], [null, false]]) - ); - } - - #[test] - fn test_pg_array_parse_numbers() { - fn pn(pg_arr: &str, ty: &Type) -> Value { - pg_array_parse(pg_arr, ty).unwrap() - } - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); - assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); - assert_eq!( - pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), - json!([1.1, 2.2, 3.3]) - ); - assert_eq!( - pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), - json!([1.1, 2.2, 3.3]) - ); - assert_eq!( - pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), - json!(["NaN", "Infinity", "-Infinity"]) - ); - assert_eq!( - pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), - json!(["NaN", "Infinity", "-Infinity"]) - ); - } - - #[test] - fn 
test_pg_array_with_decoration() { - fn p(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::INT2).unwrap() - } - assert_eq!( - p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), - json!([[[1, 2, 3], [4, 5, 6]]]) - ); - } - #[test] - fn test_pg_array_parse_json() { - fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::JSONB).unwrap() - } - assert_eq!(pt(r#"{"{}"}"#), json!([{}])); - assert_eq!( - pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#), - json!([{"foo": 1, "bar": 2}]) - ); - assert_eq!( - pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#), - json!([{"foo": 1}, {"bar": 2}]) - ); - assert_eq!( - pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#), - json!([[{"foo": 1}, {"bar": 2}]]) - ); - } -} diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index a6529c920a..0e9772733d 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,15 +1,17 @@ use crate::{ - cancellation::CancelMap, + cancellation::CancellationHandlerMain, config::ProxyConfig, context::RequestMonitoring, - error::io_error, + error::{io_error, ReportableError}, + metrics::Metrics, proxy::{handle_client, ClientMode}, rate_limiter::EndpointRateLimiter, }; -use bytes::{Buf, Bytes}; +use bytes::{Buf, BufMut, Bytes, BytesMut}; +use framed_websockets::{Frame, OpCode, WebSocketServer}; use futures::{Sink, Stream}; -use hyper::upgrade::Upgraded; -use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; +use hyper1::upgrade::OnUpgrade; +use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; use std::{ @@ -20,25 +22,23 @@ use std::{ use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; -// TODO: use `std::sync::Exclusive` once it's stabilized. -// Tracking issue: https://github.com/rust-lang/rust/issues/98407. -use sync_wrapper::SyncWrapper; - pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. 
- pub struct WebSocketRw { + pub struct WebSocketRw { #[pin] - stream: SyncWrapper>, - bytes: Bytes, + stream: WebSocketServer, + recv: Bytes, + send: BytesMut, } } impl WebSocketRw { - pub fn new(stream: WebSocketStream) -> Self { + pub fn new(stream: WebSocketServer) -> Self { Self { - stream: stream.into(), - bytes: Bytes::new(), + stream, + recv: Bytes::new(), + send: BytesMut::new(), } } } @@ -49,22 +49,25 @@ impl AsyncWrite for WebSocketRw { cx: &mut Context<'_>, buf: &[u8], ) -> Poll> { - let mut stream = self.project().stream.get_pin_mut(); + let this = self.project(); + let mut stream = this.stream; ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?; - match stream.as_mut().start_send(Message::Binary(buf.into())) { + + this.send.put(buf); + match stream.as_mut().start_send(Frame::binary(this.send.split())) { Ok(()) => Poll::Ready(Ok(buf.len())), Err(e) => Poll::Ready(Err(io_error(e))), } } fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_flush(cx).map_err(io_error) } fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - let stream = self.project().stream.get_pin_mut(); + let stream = self.project().stream; stream.poll_close(cx).map_err(io_error) } } @@ -75,13 +78,10 @@ impl AsyncRead for WebSocketRw { cx: &mut Context<'_>, buf: &mut ReadBuf<'_>, ) -> Poll> { - if buf.remaining() > 0 { - let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; - let len = std::cmp::min(bytes.len(), buf.remaining()); - buf.put_slice(&bytes[..len]); - self.consume(len); - } - + let bytes = ready!(self.as_mut().poll_fill_buf(cx))?; + let len = std::cmp::min(bytes.len(), buf.remaining()); + buf.put_slice(&bytes[..len]); + self.consume(len); Poll::Ready(Ok(())) } } @@ -93,31 +93,27 @@ impl AsyncBufRead for WebSocketRw { let mut this = self.project(); loop { - if !this.bytes.chunk().is_empty() { - let chunk = (*this.bytes).chunk(); + if 
!this.recv.chunk().is_empty() { + let chunk = (*this.recv).chunk(); return Poll::Ready(Ok(chunk)); } - let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx)); + let res = ready!(this.stream.as_mut().poll_next(cx)); match res.transpose().map_err(io_error)? { - Some(message) => match message { - Message::Ping(_) => {} - Message::Pong(_) => {} - Message::Text(text) => { + Some(message) => match message.opcode { + OpCode::Ping => {} + OpCode::Pong => {} + OpCode::Text => { // We expect to see only binary messages. let error = "unexpected text message in the websocket"; - warn!(length = text.len(), error); + warn!(length = message.payload.len(), error); return Poll::Ready(Err(io_error(error))); } - Message::Frame(_) => { - // This case is impossible according to Frame's doc. - panic!("unexpected raw frame in the websocket"); + OpCode::Binary | OpCode::Continuation => { + debug_assert!(this.recv.is_empty()); + *this.recv = message.payload.freeze(); } - Message::Binary(chunk) => { - assert!(this.bytes.is_empty()); - *this.bytes = Bytes::from(chunk); - } - Message::Close(_) => return EOF, + OpCode::Close => return EOF, }, None => return EOF, } @@ -125,44 +121,69 @@ impl AsyncBufRead for WebSocketRw { } fn consume(self: Pin<&mut Self>, amount: usize) { - self.project().bytes.advance(amount); + self.project().recv.advance(amount); } } pub async fn serve_websocket( config: &'static ProxyConfig, - ctx: &mut RequestMonitoring, - websocket: HyperWebsocket, - cancel_map: &CancelMap, - hostname: Option, + mut ctx: RequestMonitoring, + websocket: OnUpgrade, + cancellation_handler: Arc, endpoint_rate_limiter: Arc, + hostname: Option, ) -> anyhow::Result<()> { let websocket = websocket.await?; - handle_client( + let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket)); + + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Ws); + + let res = Box::pin(handle_client( config, - ctx, - cancel_map, + &mut ctx, + 
cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, - ) - .await?; - Ok(()) + conn_gauge, + )) + .await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + Err(e.into()) + } + Ok(None) => { + ctx.set_success(); + Ok(()) + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log_connect(); + p.proxy_pass().await + } + } } #[cfg(test)] mod tests { use std::pin::pin; + use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use hyper_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; use tokio::{ io::{duplex, AsyncReadExt, AsyncWriteExt}, task::JoinSet, }; + use tokio_tungstenite::{ + tungstenite::{protocol::Role, Message}, + WebSocketStream, + }; use super::WebSocketRw; @@ -187,9 +208,7 @@ mod tests { }); js.spawn(async move { - let mut rw = pin!(WebSocketRw::new( - WebSocketStream::from_raw_socket(stream2, Role::Server, None).await - )); + let mut rw = pin!(WebSocketRw::new(WebSocketServer::after_handshake(stream2))); let mut buf = vec![0; 1024]; let n = rw.read(&mut buf).await.unwrap(); diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index f48b3fe39f..690e92ffb1 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,6 +1,6 @@ use crate::config::TlsServerEndPoint; -use crate::error::UserFacingError; -use anyhow::bail; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::Metrics; use bytes::BytesMut; use pq_proto::framed::{ConnectionError, Framed}; @@ -73,6 +73,30 @@ impl PqStream { } } +#[derive(Debug)] +pub struct ReportedError { + source: anyhow::Error, + error_kind: ErrorKind, +} + +impl std::fmt::Display for ReportedError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.source.fmt(f) + } +} + +impl std::error::Error for ReportedError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + 
self.source.source() + } +} + +impl ReportableError for ReportedError { + fn get_error_kind(&self) -> ErrorKind { + self.error_kind + } +} + impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { @@ -98,24 +122,52 @@ impl PqStream { /// Write the error message using [`Self::write_message`], then re-throw it. /// Allowing string literals is safe under the assumption they might not contain any runtime info. /// This method exists due to `&str` not implementing `Into`. - pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { - tracing::info!("forwarding error to user: {error}"); - self.write_message(&BeMessage::ErrorResponse(error, None)) - .await?; - bail!(error) + pub async fn throw_error_str( + &mut self, + msg: &'static str, + error_kind: ErrorKind, + ) -> Result { + tracing::info!( + kind = error_kind.to_metric_label(), + msg, + "forwarding error to user" + ); + + // already error case, ignore client IO error + let _: Result<_, std::io::Error> = self + .write_message(&BeMessage::ErrorResponse(msg, None)) + .await; + + Err(ReportedError { + source: anyhow::anyhow!(msg), + error_kind, + }) } /// Write the error message using [`Self::write_message`], then re-throw it. /// Trait [`UserFacingError`] acts as an allowlist for error types. 
- pub async fn throw_error(&mut self, error: E) -> anyhow::Result + pub async fn throw_error(&mut self, error: E) -> Result where E: UserFacingError + Into, { + let error_kind = error.get_error_kind(); let msg = error.to_string_client(); - tracing::info!("forwarding error to user: {msg}"); - self.write_message(&BeMessage::ErrorResponse(&msg, None)) - .await?; - bail!(error) + tracing::info!( + kind=error_kind.to_metric_label(), + error=%error, + msg, + "forwarding error to user" + ); + + // already error case, ignore client IO error + let _: Result<_, std::io::Error> = self + .write_message(&BeMessage::ErrorResponse(&msg, None)) + .await; + + Err(ReportedError { + source: anyhow::anyhow!(error), + error_kind, + }) } } @@ -171,9 +223,20 @@ pub enum StreamUpgradeError { impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> Result, StreamUpgradeError> { + pub async fn upgrade( + self, + cfg: Arc, + record_handshake_error: bool, + ) -> Result, StreamUpgradeError> { match self { - Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?), + Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg) + .accept(raw) + .await + .inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?), Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls), } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 789a4c680c..56ed2145dc 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,21 +1,35 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. 
-use crate::{config::MetricCollectionConfig, http}; -use chrono::{DateTime, Utc}; +use crate::{ + config::{MetricBackupCollectionConfig, MetricCollectionConfig}, + context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, + http, + intern::{BranchIdInt, EndpointIdInt}, +}; +use anyhow::Context; +use async_compression::tokio::write::GzipEncoder; +use bytes::Bytes; +use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use dashmap::{mapref::entry::Entry, DashMap}; +use futures::future::select; use once_cell::sync::Lazy; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; -use smol_str::SmolStr; use std::{ convert::Infallible, + pin::pin, sync::{ atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, time::Duration, }; +use tokio::io::AsyncWriteExt; +use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace}; +use utils::backoff; +use uuid::{NoContext, Timestamp}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -30,23 +44,97 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// because we enrich the event with project_id in the control-plane endpoint. 
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] pub struct Ids { - pub endpoint_id: SmolStr, - pub branch_id: SmolStr, + pub endpoint_id: EndpointIdInt, + pub branch_id: BranchIdInt, +} + +pub trait MetricCounterRecorder { + /// Record that some bytes were sent from the proxy to the client + fn record_egress(&self, bytes: u64); + /// Record that some connections were opened + fn record_connection(&self, count: usize); +} + +trait MetricCounterReporter { + fn get_metrics(&mut self) -> (u64, usize); + fn move_metrics(&self) -> (u64, usize); +} + +#[derive(Debug)] +struct MetricBackupCounter { + transmitted: AtomicU64, + opened_connections: AtomicUsize, +} + +impl MetricCounterRecorder for MetricBackupCounter { + fn record_egress(&self, bytes: u64) { + self.transmitted.fetch_add(bytes, Ordering::AcqRel); + } + + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + } +} + +impl MetricCounterReporter for MetricBackupCounter { + fn get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } } #[derive(Debug)] pub struct MetricCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, + backup: Arc, } -impl MetricCounter { +impl MetricCounterRecorder for MetricCounter { /// Record that some bytes were sent from the proxy to the client - pub fn record_egress(&self, bytes: u64) { + fn record_egress(&self, bytes: u64) { self.transmitted.fetch_add(bytes, Ordering::AcqRel); + self.backup.record_egress(bytes); } + /// Record that some connections were opened + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + self.backup.record_connection(count); + } +} + +impl MetricCounterReporter for MetricCounter { + fn get_metrics(&mut 
self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } +} + +trait Clearable { /// extract the value that should be reported + fn should_report(self: &Arc) -> Option; + /// Determine whether the counter should be cleared from the global map. + fn should_clear(self: &mut Arc) -> bool; +} + +impl Clearable for C { fn should_report(self: &Arc) -> Option { // heuristic to see if the branch is still open // if a clone happens while we are observing, the heuristic will be incorrect. @@ -55,13 +143,12 @@ impl MetricCounter { // However, for the strong count to be 1 it must have occured that at one instant // all the endpoints were closed, so missing a report because the endpoints are closed is valid. let is_open = Arc::strong_count(self) > 1; - let opened = self.opened_connections.swap(0, Ordering::AcqRel); // update cached metrics eagerly, even if they can't get sent // (to avoid sending the same metrics twice) // see the relevant discussion on why to do so even if the status is not success: // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 - let value = self.transmitted.swap(0, Ordering::AcqRel); + let (value, opened) = self.move_metrics(); // Our only requirement is that we report in every interval if there was an open connection // if there were no opened connections since, then we don't need to report @@ -71,15 +158,12 @@ impl MetricCounter { Some(value) } } - - /// Determine whether the counter should be cleared from the global map. 
fn should_clear(self: &mut Arc) -> bool { // we can't clear this entry if it's acquired elsewhere let Some(counter) = Arc::get_mut(self) else { return false; }; - let opened = *counter.opened_connections.get_mut(); - let value = *counter.transmitted.get_mut(); + let (opened, value) = counter.get_metrics(); // clear if there's no data to report value == 0 && opened == 0 } @@ -91,11 +175,26 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub struct Metrics { endpoints: DashMap, FastHasher>, + backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint pub fn register(&self, ids: Ids) -> Arc { + let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { + entry.clone() + } else { + self.backup_endpoints + .entry(ids.clone()) + .or_insert_with(|| { + Arc::new(MetricBackupCounter { + transmitted: AtomicU64::new(0), + opened_connections: AtomicUsize::new(0), + }) + }) + .clone() + }; + let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { @@ -105,12 +204,13 @@ impl Metrics { Arc::new(MetricCounter { transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), + backup: backup.clone(), }) }) .clone() }; - entry.opened_connections.fetch_add(1, Ordering::AcqRel); + entry.record_connection(1); entry } } @@ -133,7 +233,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result, - now: DateTime, -) { - info!( - "starting collect_metrics_iteration. 
metric_collection_endpoint: {}", - metric_collection_endpoint - ); - +fn collect_and_clear_metrics( + endpoints: &DashMap, FastHasher>, +) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec<(Ids, u64)> = metrics - .endpoints + let metrics_to_send: Vec<(Ids, u64)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); @@ -174,33 +262,71 @@ async fn collect_metrics_iteration( }) .collect(); + for metric in metrics_to_clear { + match endpoints.entry(metric) { + Entry::Occupied(mut counter) => { + if counter.get_mut().should_clear() { + counter.remove_entry(); + } + } + Entry::Vacant(_) => {} + } + } + metrics_to_send +} + +fn create_event_chunks<'a>( + metrics_to_send: &'a [(Ids, u64)], + hostname: &'a str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) -> impl Iterator>> + 'a { + // Split into chunks of 1000 metrics to avoid exceeding the max request size + metrics_to_send + .chunks(chunk_size) + .map(move |chunk| EventChunk { + events: chunk + .iter() + .map(|(ids, value)| Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: *value, + extra: ids.clone(), + }) + .collect(), + }) +} + +#[instrument(skip_all)] +async fn collect_metrics_iteration( + endpoints: &DashMap, FastHasher>, + client: &http::ClientWithMiddleware, + metric_collection_endpoint: &reqwest::Url, + hostname: &str, + prev: DateTime, + now: DateTime, +) { + info!( + "starting collect_metrics_iteration. metric_collection_endpoint: {}", + metric_collection_endpoint + ); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + if metrics_to_send.is_empty() { trace!("no new metrics to send"); } // Send metrics. 
- // Split into chunks of 1000 metrics to avoid exceeding the max request size - for chunk in metrics_to_send.chunks(CHUNK_SIZE) { - let events = chunk - .iter() - .map(|(ids, value)| Event { - kind: EventType::Incremental { - start_time: prev, - stop_time: now, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value: *value, - extra: Ids { - endpoint_id: ids.endpoint_id.clone(), - branch_id: ids.branch_id.clone(), - }, - }) - .collect(); - + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) { let res = client .post(metric_collection_endpoint.clone()) - .json(&EventChunk { events }) + .json(&chunk) .send() .await; @@ -214,23 +340,142 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) { + for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large error!("potentially abnormal metric value: {:?}", metric); } } } +} - for metric in metrics_to_clear { - match metrics.endpoints.entry(metric) { - Entry::Occupied(mut counter) => { - if counter.get_mut().should_clear() { - counter.remove_entry(); - } - } - Entry::Vacant(_) => {} +pub async fn task_backup( + backup_config: &MetricBackupCollectionConfig, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + info!("metrics backup config: {backup_config:?}"); + scopeguard::defer! { + info!("metrics backup has shut down"); + } + // Even if the remote storage is not configured, we still want to clear the metrics. 
+ let storage = backup_config + .remote_storage_config + .as_ref() + .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) + .transpose()?; + let mut ticker = tokio::time::interval(backup_config.interval); + let mut prev = Utc::now(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + loop { + select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await; + let now = Utc::now(); + collect_metrics_backup_iteration( + &USAGE_METRICS.backup_endpoints, + &storage, + &hostname, + prev, + now, + backup_config.chunk_size, + ) + .await; + + prev = now; + if cancellation_token.is_cancelled() { + info!("metrics backup has been cancelled"); + break; } } + Ok(()) +} + +#[instrument(skip_all)] +async fn collect_metrics_backup_iteration( + endpoints: &DashMap, FastHasher>, + storage: &Option, + hostname: &str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) { + let year = now.year(); + let month = now.month(); + let day = now.day(); + let hour = now.hour(); + let minute = now.minute(); + let second = now.second(); + let cancel = CancellationToken::new(); + + info!("starting collect_metrics_backup_iteration"); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + + if metrics_to_send.is_empty() { + trace!("no new metrics to send"); + } + + // Send metrics. 
+ for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) { + let real_now = Utc::now(); + let id = uuid::Uuid::new_v7(Timestamp::from_unix( + NoContext, + real_now.second().into(), + real_now.nanosecond(), + )); + let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz"); + let remote_path = match RemotePath::from_string(&path) { + Ok(remote_path) => remote_path, + Err(e) => { + error!("failed to create remote path from str {path}: {:?}", e); + continue; + } + }; + + let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await; + + if let Err(e) = res { + error!( + "failed to upload consumption events to remote storage: {:?}", + e + ); + } + } +} + +async fn upload_events_chunk( + storage: &Option, + chunk: EventChunk<'_, Event>, + remote_path: &RemotePath, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let storage = match storage { + Some(storage) => storage, + None => { + error!("no remote storage configured"); + return Ok(()); + } + }; + let data = serde_json::to_vec(&chunk).context("serialize metrics")?; + let mut encoder = GzipEncoder::new(Vec::new()); + encoder.write_all(&data).await.context("compress metrics")?; + encoder.shutdown().await.context("compress metrics")?; + let compressed_data: Bytes = encoder.get_ref().clone().into(); + backoff::retry( + || async { + let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); + storage + .upload(stream, compressed_data.len(), remote_path, None, cancel) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_UPLOAD_MAX_RETRIES, + "request_data_upload", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("request_data_upload")?; + Ok(()) } #[cfg(test)] @@ -249,8 +494,8 @@ mod tests { }; use url::Url; - use super::{collect_metrics_iteration, Ids, Metrics}; - use crate::{http, 
rate_limiter::RateLimiterConfig}; + use super::*; + use crate::{http, BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -280,23 +525,24 @@ mod tests { tokio::spawn(server); let metrics = Metrics::default(); - let client = http::new_client(RateLimiterConfig::default()); + let client = http::new_client(); let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); // no counters have been registered - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // register a new counter + let counter = metrics.register(Ids { - endpoint_id: "e1".into(), - branch_id: "b1".into(), + endpoint_id: (&EndpointId::from("e1")).into(), + branch_id: (&BranchId::from("b1")).into(), }); // the counter should be observed despite 0 egress - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -306,7 +552,7 @@ mod tests { counter.record_egress(1); // egress should be observered - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -316,11 +562,19 @@ mod tests { drop(counter); // we do not observe the counter - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered 
assert!(metrics.endpoints.is_empty()); + + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + assert!(!metrics.backup_endpoints.is_empty()); + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + // backup counter is unregistered after the second iteration + assert!(metrics.backup_endpoints.is_empty()); } } diff --git a/pyproject.toml b/pyproject.toml index bb04123e05..c7f1a07512 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,21 +10,21 @@ pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} -requests = "^2.31.0" +requests = "^2.32.0" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" -Jinja2 = "^3.0.2" +Jinja2 = "^3.1.4" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {extras = ["server"], version = "^4.1.2"} +moto = {extras = ["server"], version = "^5.0.6"} backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "^3.0.1" +Werkzeug = "^3.0.3" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" pytest-asyncio = "^0.21.0" @@ -33,22 +33,29 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" -aiohttp = "3.9.0" +aiohttp = "3.9.4" pytest-rerunfailures = "^13.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" +httpx = {extras = ["http2"], version = "^0.26.0"} +pytest-repeat = "^0.9.3" +websockets = "^12.0" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" -ruff = "^0.1.11" +ruff = "^0.2.2" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.mypy] -exclude = "^vendor/" +exclude = [ + "^vendor/", + "^target/", + "test_runner/performance/pgvector/loaddata.py", +] check_untyped_defs = true # Help mypy find 
imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. @@ -72,7 +79,13 @@ ignore_missing_imports = true [tool.ruff] target-version = "py39" -extend-exclude = ["vendor/"] +extend-exclude = [ + "vendor/", + "target/", +] +line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter + +[tool.ruff.lint] ignore = [ "E501", # Line too long, we don't want to be too strict about it ] @@ -82,5 +95,5 @@ select = [ "I", # isort "W", # pycodestyle "B", # bugbear + "UP032", # f-string ] -line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 9b5a965f7d..dcae25a287 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.75.0" +channel = "1.79.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs deleted file mode 100644 index 957213856b..0000000000 --- a/s3_scrubber/src/main.rs +++ /dev/null @@ -1,106 +0,0 @@ -use pageserver_api::shard::TenantShardId; -use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::scan_metadata::scan_metadata; -use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; - -use clap::{Parser, Subcommand}; - -#[derive(Parser)] -#[command(author, version, about, long_about = None)] -#[command(arg_required_else_help(true))] -struct Cli { - #[command(subcommand)] - command: Command, - - #[arg(short, long, default_value_t = false)] - delete: bool, -} - -#[derive(Subcommand, Debug)] -enum Command { - FindGarbage { - #[arg(short, long)] - node_kind: NodeKind, - #[arg(short, long, default_value_t=TraversingDepth::Tenant)] - depth: TraversingDepth, - #[arg(short, long, default_value_t = String::from("garbage.json"))] - output_path: String, - }, - PurgeGarbage { - #[arg(short, long)] - input_path: String, - #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] - mode: PurgeMode, - }, - ScanMetadata { - #[arg(short, long, default_value_t = false)] - json: bool, - #[arg(long = "tenant-id", num_args = 0..)] - tenant_ids: Vec, - }, -} - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - let cli = Cli::parse(); - - let bucket_config = BucketConfig::from_env()?; - - let command_log_name = match &cli.command { - Command::ScanMetadata { .. } => "scan", - Command::FindGarbage { .. } => "find-garbage", - Command::PurgeGarbage { .. 
} => "purge-garbage", - }; - let _guard = init_logging(&format!( - "{}_{}_{}_{}.log", - std::env::args().next().unwrap(), - command_log_name, - bucket_config.bucket, - chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") - )); - - match cli.command { - Command::ScanMetadata { json, tenant_ids } => { - match scan_metadata(bucket_config.clone(), tenant_ids).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", summary.summary_string()); - } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) - } - } - } - } - Command::FindGarbage { - node_kind, - depth, - output_path, - } => { - let console_config = ConsoleConfig::from_env()?; - find_garbage(bucket_config, console_config, depth, node_kind, output_path).await - } - Command::PurgeGarbage { input_path, mode } => { - purge_garbage(input_path, mode, !cli.delete).await - } - } -} diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 364cad7892..a650d5e207 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -33,6 +33,7 @@ once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true +rand.workspace = true regex.workspace = true scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } @@ -45,6 +46,7 @@ tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } tokio-io-timeout.workspace = 
true tokio-postgres.workspace = true +tokio-tar.workspace = true toml_edit.workspace = true tracing.workspace = true url.workspace = true @@ -61,3 +63,10 @@ tokio-stream.workspace = true utils.workspace = true workspace_hack.workspace = true + +[dev-dependencies] +walproposer.workspace = true +rand.workspace = true +desim.workspace = true +tracing.workspace = true +tracing-subscriber = { workspace = true, features = ["json"] } diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index bf4905aaa7..dd9058c468 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -12,8 +12,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::PageServerApi, _) => Err(AuthError( - "PageServerApi scope makes no sense for Safekeeper".into(), + (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Safekeeper auth", + claims.scope + ) + .into(), )), (Scope::SafekeeperData, _) => Ok(()), } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 33047051df..86238c7292 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -13,14 +13,15 @@ use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; use toml_edit::Document; +use utils::logging::SecretString; +use std::env::{var, VarError}; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use storage_broker::Uri; -use tokio::sync::mpsc; use tracing::*; use utils::pid_file; @@ -28,15 +29,14 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, - DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; +use safekeeper::http; use safekeeper::wal_service; use 
safekeeper::GlobalTimelines; use safekeeper::SafeKeeperConf; use safekeeper::{broker, WAL_SERVICE_RUNTIME}; use safekeeper::{control_file, BROKER_RUNTIME}; -use safekeeper::{http, WAL_REMOVER_RUNTIME}; -use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME}; use safekeeper::{wal_backup, HTTP_RUNTIME}; use storage_broker::DEFAULT_ENDPOINT; use utils::auth::{JwtAuth, Scope, SwappableJwtAuth}; @@ -166,6 +166,21 @@ struct Args { /// useful for debugging. #[arg(long)] current_thread_runtime: bool, + /// Keep horizon for walsenders, i.e. don't remove WAL segments that are + /// still needed for existing replication connection. + #[arg(long)] + walsenders_keep_horizon: bool, + /// Enable partial backup. If disabled, safekeeper will not upload partial + /// segments to remote storage. + #[arg(long)] + partial_backup_enabled: bool, + /// Controls how long backup will wait until uploading the partial segment. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] + partial_backup_timeout: Duration, + /// Disable task to push messages to broker every second. Supposed to + /// be used in tests. + #[arg(long)] + disable_periodic_broker_push: bool, } // Like PathBufValueParser, but allows empty string. @@ -274,6 +289,22 @@ async fn main() -> anyhow::Result<()> { } }; + // Load JWT auth token to connect to other safekeepers for pull_timeline. 
+ let sk_auth_token = match var("SAFEKEEPER_AUTH_TOKEN") { + Ok(v) => { + info!("loaded JWT token for authentication with safekeepers"); + Some(SecretString::from(v)) + } + Err(VarError::NotPresent) => { + info!("no JWT token for authentication with safekeepers detected"); + None + } + Err(_) => { + warn!("JWT token for authentication with safekeepers is not unicode"); + None + } + }; + let conf = SafeKeeperConf { workdir, my_id: id, @@ -294,7 +325,12 @@ async fn main() -> anyhow::Result<()> { pg_auth, pg_tenant_only_auth, http_auth, + sk_auth_token, current_thread_runtime: args.current_thread_runtime, + walsenders_keep_horizon: args.walsenders_keep_horizon, + partial_backup_enabled: args.partial_backup_enabled, + partial_backup_timeout: args.partial_backup_timeout, + disable_periodic_broker_push: args.disable_periodic_broker_push, }; // initialize sentry if SENTRY_DSN is provided @@ -358,7 +394,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + wal_backup::init_remote_storage(&conf); // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = @@ -370,19 +406,9 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let current_thread_rt = conf .current_thread_runtime .then(|| Handle::try_current().expect("no runtime in main")); - let conf_ = conf.clone(); - let wal_backup_handle = current_thread_rt - .as_ref() - .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle()) - .spawn(wal_backup::wal_backup_launcher_task_main( - conf_, - wal_backup_launcher_rx, - )) - .map(|res| ("WAL backup launcher".to_owned(), res)); - tasks_handles.push(Box::pin(wal_backup_handle)); // Load all timelines from disk to memory. 
- GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?; + GlobalTimelines::init(conf.clone()).await?; let conf_ = conf.clone(); // Run everything in current thread rt, if asked. @@ -433,14 +459,6 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("broker main".to_owned(), res)); tasks_handles.push(Box::pin(broker_task_handle)); - let conf_ = conf.clone(); - let wal_remover_handle = current_thread_rt - .as_ref() - .unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle()) - .spawn(remove_wal::task_main(conf_)) - .map(|res| ("WAL remover".to_owned(), res)); - tasks_handles.push(Box::pin(wal_remover_handle)); - set_build_info_metric(GIT_VERSION, BUILD_TAG); // TODO: update tokio-stream, convert to real async Stream with diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 2b1db2714b..7cc2142291 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -10,11 +10,20 @@ use anyhow::Result; use storage_broker::parse_proto_ttid; use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey; +use storage_broker::proto::FilterTenantTimelineId; +use storage_broker::proto::MessageType; +use storage_broker::proto::SafekeeperDiscoveryResponse; +use storage_broker::proto::SubscribeByFilterRequest; use storage_broker::proto::SubscribeSafekeeperInfoRequest; +use storage_broker::proto::TypeSubscription; +use storage_broker::proto::TypedMessage; use storage_broker::Request; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; +use std::time::UNIX_EPOCH; use tokio::task::JoinHandle; use tokio::time::sleep; use tracing::*; @@ -31,6 +40,14 @@ const PUSH_INTERVAL_MSEC: u64 = 1000; /// Push once in a while data about all active timelines to the broker. 
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { + if conf.disable_periodic_broker_push { + info!("broker push_loop is disabled, doing nothing..."); + futures::future::pending::<()>().await; // sleep forever + return Ok(()); + } + + let active_timelines_set = GlobalTimelines::get_global_broker_active_set(); + let mut client = storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC); @@ -42,15 +59,9 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { // sensitive and there is no risk of deadlock as we don't await while // lock is held. let now = Instant::now(); - let all_tlis = GlobalTimelines::get_all(); + let all_tlis = active_timelines_set.get_all(); let mut n_pushed_tlis = 0; for tli in &all_tlis { - // filtering alternative futures::stream::iter(all_tlis) - // .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::>().await; - // doesn't look better, and I'm not sure how to do that without collect. - if !tli.is_active().await { - continue; - } let sk_info = tli.get_safekeeper_info(&conf).await; yield sk_info; BROKER_PUSHED_UPDATES.inc(); @@ -75,7 +86,8 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. -async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { +#[instrument(name = "broker pull", skip_all)] +async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; // TODO: subscribe only to local timelines instead of all @@ -94,6 +106,8 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { let err_counter = BROKER_PULLED_UPDATES.with_label_values(&["error"]); while let Some(msg) = stream.message().await? 
{ + stats.update_pulled(); + let proto_ttid = msg .tenant_timeline_id .as_ref() @@ -119,12 +133,94 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> { bail!("end of stream"); } +/// Process incoming discover requests. This is done in a separate task to avoid +/// interfering with the normal pull/push loops. +async fn discover_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { + let mut client = + storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?; + + let request = SubscribeByFilterRequest { + types: vec![TypeSubscription { + r#type: MessageType::SafekeeperDiscoveryRequest as i32, + }], + tenant_timeline_id: Some(FilterTenantTimelineId { + enabled: false, + tenant_timeline_id: None, + }), + }; + + let mut stream = client + .subscribe_by_filter(request) + .await + .context("subscribe_by_filter request failed")? + .into_inner(); + + let discover_counter = BROKER_PULLED_UPDATES.with_label_values(&["discover"]); + + while let Some(typed_msg) = stream.message().await? 
{ + stats.update_pulled(); + + match typed_msg.r#type() { + MessageType::SafekeeperDiscoveryRequest => { + let msg = typed_msg + .safekeeper_discovery_request + .expect("proto type mismatch from broker message"); + + let proto_ttid = msg + .tenant_timeline_id + .as_ref() + .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?; + let ttid = parse_proto_ttid(proto_ttid)?; + if let Ok(tli) = GlobalTimelines::get(ttid) { + // we received a discovery request for a timeline we know about + discover_counter.inc(); + + // create and reply with discovery response + let sk_info = tli.get_safekeeper_info(&conf).await; + let response = SafekeeperDiscoveryResponse { + safekeeper_id: sk_info.safekeeper_id, + tenant_timeline_id: sk_info.tenant_timeline_id, + commit_lsn: sk_info.commit_lsn, + safekeeper_connstr: sk_info.safekeeper_connstr, + availability_zone: sk_info.availability_zone, + standby_horizon: 0, + }; + + // note this is a blocking call + client + .publish_one(TypedMessage { + r#type: MessageType::SafekeeperDiscoveryResponse as i32, + safekeeper_timeline_info: None, + safekeeper_discovery_request: None, + safekeeper_discovery_response: Some(response), + }) + .await?; + } + } + + _ => { + warn!( + "unexpected message type i32 {}, {:?}", + typed_msg.r#type, + typed_msg.r#type() + ); + } + } + } + bail!("end of stream"); +} + pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { info!("started, broker endpoint {:?}", conf.broker_endpoint); let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC)); let mut push_handle: Option>> = None; let mut pull_handle: Option>> = None; + let mut discover_handle: Option>> = None; + + let stats = Arc::new(BrokerStats::new()); + let stats_task = task_stats(stats.clone()); + tokio::pin!(stats_task); // Selecting on JoinHandles requires some squats; is there a better way to // reap tasks individually? 
@@ -153,13 +249,77 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { }; pull_handle = None; }, + res = async { discover_handle.as_mut().unwrap().await }, if discover_handle.is_some() => { + // was it panic or normal error? + match res { + Ok(res_internal) => if let Err(err_inner) = res_internal { + warn!("discover task failed: {:?}", err_inner); + } + Err(err_outer) => { warn!("discover task panicked: {:?}", err_outer) } + }; + discover_handle = None; + }, _ = ticker.tick() => { if push_handle.is_none() { push_handle = Some(tokio::spawn(push_loop(conf.clone()))); } if pull_handle.is_none() { - pull_handle = Some(tokio::spawn(pull_loop(conf.clone()))); + pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone()))); } + if discover_handle.is_none() { + discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone()))); + } + }, + _ = &mut stats_task => {} + } + } +} + +struct BrokerStats { + /// Timestamp of the last received message from the broker. + last_pulled_ts: AtomicU64, +} + +impl BrokerStats { + fn new() -> Self { + BrokerStats { + last_pulled_ts: AtomicU64::new(0), + } + } + + fn now_millis() -> u64 { + std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time is before epoch") + .as_millis() as u64 + } + + /// Update last_pulled timestamp to current time. + fn update_pulled(&self) { + self.last_pulled_ts + .store(Self::now_millis(), std::sync::atomic::Ordering::Relaxed); + } +} + +/// Periodically write to logs if there are issues with receiving data from the broker. +async fn task_stats(stats: Arc) { + let warn_duration = Duration::from_secs(10); + let mut ticker = tokio::time::interval(warn_duration); + + loop { + tokio::select! 
{ + _ = ticker.tick() => { + let last_pulled = stats.last_pulled_ts.load(std::sync::atomic::Ordering::SeqCst); + if last_pulled == 0 { + // no broker updates yet + continue; + } + + let now = BrokerStats::now_millis(); + if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 { + let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp"); + info!("no broker updates for some time, last update: {:?}", ts); + } } } } diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 591bfea182..9d65187350 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -2,26 +2,28 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use camino::Utf8PathBuf; -use tokio::fs::{self, File}; +use camino::{Utf8Path, Utf8PathBuf}; +use tokio::fs::File; use tokio::io::AsyncWriteExt; +use utils::crashsafe::durable_rename; use std::io::Read; use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::upgrade_control_file; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; -use crate::safekeeper::{SafeKeeperState, SK_FORMAT_VERSION, SK_MAGIC}; +use crate::state::TimelinePersistentState; +use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir}; use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; -use std::convert::TryInto; +pub const SK_MAGIC: u32 = 0xcafeceefu32; +pub const SK_FORMAT_VERSION: u32 = 8; // contains persistent metadata for safekeeper -const CONTROL_FILE_NAME: &str = "safekeeper.control"; +pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; // needed to atomically update the state using `rename` const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); @@ -29,9 +31,9 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); /// Storage should keep actual 
state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. #[async_trait::async_trait] -pub trait Storage: Deref { +pub trait Storage: Deref { /// Persist safekeeper state on disk and update internal state. - async fn persist(&mut self, s: &SafeKeeperState) -> Result<()>; + async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()>; /// Timestamp of last persist. fn last_persist_at(&self) -> Instant; @@ -41,10 +43,10 @@ pub trait Storage: Deref { pub struct FileStorage { // save timeline dir to avoid reconstructing it every time timeline_dir: Utf8PathBuf, - conf: SafeKeeperConf, + no_sync: bool, /// Last state persisted to disk. - state: SafeKeeperState, + state: TimelinePersistentState, /// Not preserved across restarts. last_persist_at: Instant, } @@ -52,13 +54,12 @@ pub struct FileStorage { impl FileStorage { /// Initialize storage by loading state from disk. pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { - let timeline_dir = conf.timeline_dir(ttid); - - let state = Self::load_control_file_conf(conf, ttid)?; + let timeline_dir = get_timeline_dir(conf, ttid); + let state = Self::load_control_file_from_dir(&timeline_dir)?; Ok(FileStorage { timeline_dir, - conf: conf.clone(), + no_sync: conf.no_sync, state, last_persist_at: Instant::now(), }) @@ -68,11 +69,11 @@ impl FileStorage { pub fn create_new( timeline_dir: Utf8PathBuf, conf: &SafeKeeperConf, - state: SafeKeeperState, + state: TimelinePersistentState, ) -> Result { let store = FileStorage { timeline_dir, - conf: conf.clone(), + no_sync: conf.no_sync, state, last_persist_at: Instant::now(), }; @@ -81,7 +82,7 @@ impl FileStorage { } /// Check the magic/version in the on-disk data and deserialize it, if possible. 
- fn deser_sk_state(buf: &mut &[u8]) -> Result { + fn deser_sk_state(buf: &mut &[u8]) -> Result { // Read the version independent part let magic = ReadBytesExt::read_u32::(buf)?; if magic != SK_MAGIC { @@ -93,24 +94,23 @@ impl FileStorage { } let version = ReadBytesExt::read_u32::(buf)?; if version == SK_FORMAT_VERSION { - let res = SafeKeeperState::des(buf)?; + let res = TimelinePersistentState::des(buf)?; return Ok(res); } // try to upgrade upgrade_control_file(buf, version) } - /// Load control file for given ttid at path specified by conf. - pub fn load_control_file_conf( - conf: &SafeKeeperConf, - ttid: &TenantTimelineId, - ) -> Result { - let path = conf.timeline_dir(ttid).join(CONTROL_FILE_NAME); + /// Load control file from given directory. + pub fn load_control_file_from_dir(timeline_dir: &Utf8Path) -> Result { + let path = timeline_dir.join(CONTROL_FILE_NAME); Self::load_control_file(path) } /// Read in the control file. - pub fn load_control_file>(control_file_path: P) -> Result { + pub fn load_control_file>( + control_file_path: P, + ) -> Result { let mut control_file = std::fs::OpenOptions::new() .read(true) .write(true) @@ -153,7 +153,7 @@ impl FileStorage { } impl Deref for FileStorage { - type Target = SafeKeeperState; + type Target = TimelinePersistentState; fn deref(&self) -> &Self::Target { &self.state @@ -165,7 +165,7 @@ impl Storage for FileStorage { /// Persists state durably to the underlying storage. /// /// For a description, see . 
- async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { + async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); // write data to safekeeper.control.partial @@ -198,35 +198,8 @@ impl Storage for FileStorage { ) })?; - // fsync the file - if !self.conf.no_sync { - control_partial.sync_all().await.with_context(|| { - format!( - "failed to sync partial control file at {}", - control_partial_path - ) - })?; - } - let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); - - // rename should be atomic - fs::rename(&control_partial_path, &control_path).await?; - // this sync is not required by any standard but postgres does this (see durable_rename) - if !self.conf.no_sync { - let new_f = File::open(&control_path).await?; - new_f - .sync_all() - .await - .with_context(|| format!("failed to sync control file at: {}", &control_path))?; - - // fsync the directory (linux specific) - let tli_dir = File::open(&self.timeline_dir).await?; - tli_dir - .sync_all() - .await - .context("failed to sync control file directory")?; - } + durable_rename(&control_partial_path, &control_path, !self.no_sync).await?; // update internal state self.state = s.clone(); @@ -240,11 +213,9 @@ impl Storage for FileStorage { #[cfg(test)] mod test { - use super::FileStorage; use super::*; - use crate::{safekeeper::SafeKeeperState, SafeKeeperConf}; - use anyhow::Result; - use utils::{id::TenantTimelineId, lsn::Lsn}; + use tokio::fs; + use utils::lsn::Lsn; fn stub_conf() -> SafeKeeperConf { let workdir = camino_tempfile::tempdir().unwrap().into_path(); @@ -257,25 +228,26 @@ mod test { async fn load_from_control_file( conf: &SafeKeeperConf, ttid: &TenantTimelineId, - ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + ) -> Result<(FileStorage, TimelinePersistentState)> { + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await 
.expect("failed to create timeline dir"); Ok(( FileStorage::restore_new(ttid, conf)?, - FileStorage::load_control_file_conf(conf, ttid)?, + FileStorage::load_control_file_from_dir(&timeline_dir)?, )) } async fn create( conf: &SafeKeeperConf, ttid: &TenantTimelineId, - ) -> Result<(FileStorage, SafeKeeperState)> { - fs::create_dir_all(conf.timeline_dir(ttid)) + ) -> Result<(FileStorage, TimelinePersistentState)> { + let timeline_dir = get_timeline_dir(conf, ttid); + fs::create_dir_all(&timeline_dir) .await .expect("failed to create timeline dir"); - let state = SafeKeeperState::empty(); - let timeline_dir = conf.timeline_dir(ttid); + let state = TimelinePersistentState::empty(); let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; Ok((storage, state)) } @@ -316,7 +288,7 @@ mod test { .await .expect("failed to persist state"); } - let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME); + let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME); let mut data = fs::read(&control_path).await.unwrap(); data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data) diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index a0be2b2054..8f4dfe9b43 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,6 +1,8 @@ //! 
Code to deal with safekeeper control file upgrades -use crate::safekeeper::{ - AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermLsn, +use crate::{ + safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, + state::{PersistedPeers, TimelinePersistentState}, + wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; @@ -137,7 +139,51 @@ pub struct SafeKeeperStateV4 { pub peers: PersistedPeers, } -pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV7 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently driven + /// only by walproposer. 
+ pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). + pub peers: PersistedPeers, +} + +pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { info!("reading safekeeper control file version {}", version); @@ -149,7 +195,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result lsn: Lsn(0), }]), }; - return Ok(SafeKeeperState { + return Ok(TimelinePersistentState { tenant_id: oldstate.server.tenant_id, timeline_id: oldstate.server.timeline_id, acceptor_state: ac, @@ -166,6 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: PersistedPeers(vec![]), + partial_backup: wal_backup_partial::State::default(), }); // migrate to hexing some ids } else if version == 2 { @@ -176,7 +223,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result system_id: oldstate.server.system_id, wal_seg_size: oldstate.server.wal_seg_size, }; - return Ok(SafeKeeperState { + return Ok(TimelinePersistentState { tenant_id: oldstate.server.tenant_id, timeline_id: oldstate.server.timeline_id, acceptor_state: oldstate.acceptor_state, @@ -189,6 +236,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: PersistedPeers(vec![]), + partial_backup: wal_backup_partial::State::default(), }); // migrate to moving 
tenant_id/timeline_id to the top and adding some lsns } else if version == 3 { @@ -199,7 +247,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result system_id: oldstate.server.system_id, wal_seg_size: oldstate.server.wal_seg_size, }; - return Ok(SafeKeeperState { + return Ok(TimelinePersistentState { tenant_id: oldstate.server.tenant_id, timeline_id: oldstate.server.timeline_id, acceptor_state: oldstate.acceptor_state, @@ -212,6 +260,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result peer_horizon_lsn: oldstate.truncate_lsn, remote_consistent_lsn: Lsn(0), peers: PersistedPeers(vec![]), + partial_backup: wal_backup_partial::State::default(), }); // migrate to having timeline_start_lsn } else if version == 4 { @@ -222,7 +271,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result system_id: oldstate.server.system_id, wal_seg_size: oldstate.server.wal_seg_size, }; - return Ok(SafeKeeperState { + return Ok(TimelinePersistentState { tenant_id: oldstate.tenant_id, timeline_id: oldstate.timeline_id, acceptor_state: oldstate.acceptor_state, @@ -235,10 +284,11 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result peer_horizon_lsn: oldstate.peer_horizon_lsn, remote_consistent_lsn: Lsn(0), peers: PersistedPeers(vec![]), + partial_backup: wal_backup_partial::State::default(), }); } else if version == 5 { info!("reading safekeeper control file version {}", version); - let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + let mut oldstate = TimelinePersistentState::des(&buf[..buf.len()])?; if oldstate.timeline_start_lsn != Lsn(0) { return Ok(oldstate); } @@ -251,7 +301,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result return Ok(oldstate); } else if version == 6 { info!("reading safekeeper control file version {}", version); - let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?; + let mut oldstate = TimelinePersistentState::des(&buf[..buf.len()])?; if oldstate.server.pg_version != 0 { 
return Ok(oldstate); } @@ -261,7 +311,30 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result oldstate.server.pg_version = 140005; return Ok(oldstate); + } else if version == 7 { + info!("reading safekeeper control file version {}", version); + let oldstate = SafeKeeperStateV7::des(&buf[..buf.len()])?; + + return Ok(TimelinePersistentState { + tenant_id: oldstate.tenant_id, + timeline_id: oldstate.timeline_id, + acceptor_state: oldstate.acceptor_state, + server: oldstate.server, + proposer_uuid: oldstate.proposer_uuid, + timeline_start_lsn: oldstate.timeline_start_lsn, + local_start_lsn: oldstate.local_start_lsn, + commit_lsn: oldstate.commit_lsn, + backup_lsn: oldstate.backup_lsn, + peer_horizon_lsn: oldstate.peer_horizon_lsn, + remote_consistent_lsn: oldstate.remote_consistent_lsn, + peers: oldstate.peers, + partial_backup: wal_backup_partial::State::default(), + }); } + + // TODO: persist the file back to the disk after upgrade + // TODO: think about backward compatibility and rollbacks + bail!("unsupported safekeeper control file version {}", version) } diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index ef88eb27e3..51cf4db6b5 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -14,11 +14,11 @@ use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ control_file::{FileStorage, Storage}, pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, - safekeeper::SafeKeeperState, - timeline::{Timeline, TimelineError}, + state::TimelinePersistentState, + timeline::{FullAccessTimeline, Timeline, TimelineError}, wal_backup::copy_s3_segments, wal_storage::{wal_file_paths, WalReader}, - GlobalTimelines, SafeKeeperConf, + GlobalTimelines, }; // we don't want to have more than 10 segments on disk after copy, because they take space @@ -46,12 +46,14 @@ pub async fn handle_request(request: Request) -> Result<()> { } } + let source_tli = 
request.source.full_access_guard().await?; + let conf = &GlobalTimelines::get_global_config(); let ttid = request.destination_ttid; let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - let (mem_state, state) = request.source.get_state().await; + let (mem_state, state) = source_tli.get_state().await; let start_lsn = state.timeline_start_lsn; if start_lsn == Lsn::INVALID { bail!("timeline is not initialized"); @@ -60,7 +62,7 @@ pub async fn handle_request(request: Request) -> Result<()> { { let commit_lsn = mem_state.commit_lsn; - let flush_lsn = request.source.get_flush_lsn().await; + let flush_lsn = source_tli.get_flush_lsn().await; info!( "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}", @@ -127,17 +129,15 @@ pub async fn handle_request(request: Request) -> Result<()> { .await?; copy_disk_segments( - conf, - &state, + &source_tli, wal_seg_size, - &request.source.ttid, new_backup_lsn, request.until_lsn, &tli_dir_path, ) .await?; - let mut new_state = SafeKeeperState::new( + let mut new_state = TimelinePersistentState::new( &request.destination_ttid, state.server.clone(), vec![], @@ -159,21 +159,13 @@ pub async fn handle_request(request: Request) -> Result<()> { } async fn copy_disk_segments( - conf: &SafeKeeperConf, - persisted_state: &SafeKeeperState, + tli: &FullAccessTimeline, wal_seg_size: usize, - source_ttid: &TenantTimelineId, start_lsn: Lsn, end_lsn: Lsn, tli_dir_path: &Utf8PathBuf, ) -> Result<()> { - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - conf.timeline_dir(source_ttid), - persisted_state, - start_lsn, - true, - )?; + let mut wal_reader = tli.get_walreader(start_lsn).await?; let mut buf = [0u8; MAX_SEND_SIZE]; @@ -225,6 +217,7 @@ async fn write_segment( assert!(from <= to); assert!(to <= wal_seg_size); + #[allow(clippy::suspicious_open_options)] let mut file = OpenOptions::new() .create(true) .write(true) diff --git a/safekeeper/src/debug_dump.rs 
b/safekeeper/src/debug_dump.rs index c9ff1afdea..062ff4b3db 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -10,6 +10,7 @@ use std::sync::Arc; use anyhow::bail; use anyhow::Result; use camino::Utf8Path; +use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; use postgres_ffi::MAX_SEND_SIZE; @@ -22,14 +23,14 @@ use utils::id::TenantTimelineId; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; -use crate::safekeeper::SafeKeeperState; -use crate::safekeeper::SafekeeperMemState; use crate::safekeeper::TermHistory; -use crate::SafeKeeperConf; - use crate::send_wal::WalSenderState; -use crate::wal_storage::WalReader; +use crate::state::TimelineMemState; +use crate::state::TimelinePersistentState; +use crate::timeline::get_timeline_dir; +use crate::timeline::FullAccessTimeline; use crate::GlobalTimelines; +use crate::SafeKeeperConf; /// Various filters that influence the resulting JSON output. #[derive(Debug, Serialize, Deserialize, Clone)] @@ -69,6 +70,7 @@ pub struct Response { pub struct TimelineDumpSer { pub tli: Arc, pub args: Args, + pub timeline_dir: Utf8PathBuf, pub runtime: Arc, } @@ -86,14 +88,20 @@ impl Serialize for TimelineDumpSer { where S: serde::Serializer, { - let dump = self - .runtime - .block_on(build_from_tli_dump(self.tli.clone(), self.args.clone())); + let dump = self.runtime.block_on(build_from_tli_dump( + &self.tli, + &self.args, + &self.timeline_dir, + )); dump.serialize(serializer) } } -async fn build_from_tli_dump(timeline: Arc, args: Args) -> Timeline { +async fn build_from_tli_dump( + timeline: &Arc, + args: &Args, + timeline_dir: &Utf8Path, +) -> Timeline { let control_file = if args.dump_control_file { let mut state = timeline.get_state().await.1; if !args.dump_term_history { @@ -113,7 +121,8 @@ async fn build_from_tli_dump(timeline: Arc, args: Arg let disk_content = if args.dump_disk_content { // build_disk_content can fail, but we don't want to fail the whole // request 
because of that. - build_disk_content(&timeline.timeline_dir).ok() + // Note: timeline can be in offloaded state, this is not a problem. + build_disk_content(timeline_dir).ok() } else { None }; @@ -143,7 +152,7 @@ pub struct Config { pub struct Timeline { pub tenant_id: TenantId, pub timeline_id: TimelineId, - pub control_file: Option, + pub control_file: Option, pub memory: Option, pub disk_content: Option, } @@ -158,7 +167,7 @@ pub struct Memory { pub num_computes: u32, pub last_removed_segno: XLogSegNo, pub epoch_start_lsn: Lsn, - pub mem_state: SafekeeperMemState, + pub mem_state: TimelineMemState, // PhysicalStorage state. pub write_lsn: Lsn, @@ -187,6 +196,7 @@ pub struct FileInfo { pub async fn build(args: Args) -> Result { let start_time = Utc::now(); let timelines_count = GlobalTimelines::timelines_count(); + let config = GlobalTimelines::get_global_config(); let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() { // If both tenant_id and timeline_id are specified, we can just get the @@ -224,12 +234,11 @@ pub async fn build(args: Args) -> Result { timelines.push(TimelineDumpSer { tli, args: args.clone(), + timeline_dir: get_timeline_dir(&config, &ttid), runtime: runtime.clone(), }); } - let config = GlobalTimelines::get_global_config(); - Ok(Response { start_time, finish_time: Utc::now(), @@ -317,27 +326,19 @@ pub struct TimelineDigest { } pub async fn calculate_digest( - tli: &Arc, + tli: &FullAccessTimeline, request: TimelineDigestRequest, ) -> Result { if request.from_lsn > request.until_lsn { bail!("from_lsn is greater than until_lsn"); } - let conf = GlobalTimelines::get_global_config(); let (_, persisted_state) = tli.get_state().await; - if persisted_state.timeline_start_lsn > request.from_lsn { bail!("requested LSN is before the start of the timeline"); } - let mut wal_reader = WalReader::new( - conf.workdir.clone(), - tli.timeline_dir.clone(), - &persisted_state, - request.from_lsn, - true, - )?; + let mut wal_reader = 
tli.get_walreader(request.from_lsn).await?; let mut hasher = Sha256::new(); let mut buf = [0u8; MAX_SEND_SIZE]; diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 761541168c..f45bfb95fa 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -2,8 +2,7 @@ //! protocol commands. use anyhow::Context; -use std::str::FromStr; -use std::str::{self}; +use std::str::{self, FromStr}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{debug, info, info_span, Instrument}; @@ -16,8 +15,8 @@ use crate::safekeeper::Term; use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; +use postgres_backend::PostgresBackend; use postgres_backend::QueryError; -use postgres_backend::{self, PostgresBackend}; use postgres_ffi::PG_TLI; use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID}; use regex::Regex; diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs new file mode 100644 index 0000000000..0bb31c200d --- /dev/null +++ b/safekeeper/src/http/client.rs @@ -0,0 +1,139 @@ +//! Safekeeper http client. +//! +//! Partially copied from pageserver client; some parts might be better to be +//! united. +//! +//! It would be also good to move it out to separate crate, but this needs +//! duplication of internal-but-reported structs like WalSenderState, ServerInfo +//! etc. + +use reqwest::{IntoUrl, Method, StatusCode}; +use utils::{ + http::error::HttpErrorBody, + id::{TenantId, TimelineId}, + logging::SecretString, +}; + +use super::routes::TimelineStatus; + +#[derive(Debug, Clone)] +pub struct Client { + mgmt_api_endpoint: String, + authorization_header: Option, + client: reqwest::Client, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + /// Failed to receive body (reqwest error). + #[error("receive body: {0}")] + ReceiveBody(reqwest::Error), + + /// Status is not ok, but failed to parse body as `HttpErrorBody`. 
+ #[error("receive error body: {0}")] + ReceiveErrorBody(String), + + /// Status is not ok; parsed error in body as `HttpErrorBody`. + #[error("safekeeper API: {1}")] + ApiError(StatusCode, String), +} + +pub type Result = std::result::Result; + +pub trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl std::future::Future> + Send; +} + +/// If status is not ok, try to extract error message from the body. +impl ResponseErrorMessageExt for reqwest::Response { + async fn error_from_body(self) -> Result { + let status = self.status(); + if !(status.is_client_error() || status.is_server_error()) { + return Ok(self); + } + + let url = self.url().to_owned(); + Err(match self.json::().await { + Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg), + Err(_) => { + Error::ReceiveErrorBody(format!("http error ({}) at {}.", status.as_u16(), url)) + } + }) + } +} + +impl Client { + pub fn new(mgmt_api_endpoint: String, jwt: Option) -> Self { + Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt) + } + + pub fn from_client( + client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option, + ) -> Self { + Self { + mgmt_api_endpoint, + authorization_header: jwt + .map(|jwt| SecretString::from(format!("Bearer {}", jwt.get_contents()))), + client, + } + } + + pub async fn timeline_status( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.get(&uri).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + + pub async fn snapshot( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/snapshot", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + self.get(&uri).await + } + + async fn get(&self, uri: U) -> Result { + self.request(Method::GET, uri, ()).await + } + + /// Send the request and check that the status code 
is good. + async fn request( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let res = self.request_noerror(method, uri, body).await?; + let response = res.error_from_body().await?; + Ok(response) + } + + /// Just send the request. + async fn request_noerror( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + let req = self.client.request(method, uri); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value.get_contents()) + } else { + req + }; + req.json(&body).send().await.map_err(Error::ReceiveBody) + } +} diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 2a9570595f..52fb13ff5b 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -1,3 +1,4 @@ +pub mod client; pub mod routes; pub use routes::make_router; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 5283ea19c1..3f2cd97ccd 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,38 +1,25 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; - use once_cell::sync::Lazy; -use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt; +use std::io::Write as _; use std::str::FromStr; use std::sync::Arc; use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; -use tokio::fs::File; -use tokio::io::AsyncReadExt; +use tokio::sync::mpsc; +use tokio::task; +use tokio_stream::wrappers::ReceiverStream; use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::{prometheus_metrics_handler, request_span, ChannelWriter}; use utils::http::request::parse_query_param; -use std::io::Write as _; -use tokio::sync::mpsc; -use 
tokio_stream::wrappers::ReceiverStream; -use tracing::{info_span, Instrument}; -use utils::http::endpoint::{request_span, ChannelWriter}; - -use crate::debug_dump::TimelineDigestRequest; -use crate::receive_wal::WalReceiverState; -use crate::safekeeper::Term; -use crate::safekeeper::{ServerInfo, TermLsn}; -use crate::send_wal::WalSenderState; -use crate::timeline::PeerInfo; -use crate::{copy_timeline, debug_dump, pull_timeline}; - -use crate::timelines_global_map::TimelineDeleteForceResult; -use crate::GlobalTimelines; -use crate::SafeKeeperConf; +use postgres_ffi::WAL_SEGMENT_SIZE; +use safekeeper_api::models::TimelineCreateRequest; +use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use utils::{ auth::SwappableJwtAuth, http::{ @@ -46,7 +33,16 @@ use utils::{ lsn::Lsn, }; -use super::models::TimelineCreateRequest; +use crate::debug_dump::TimelineDigestRequest; +use crate::receive_wal::WalReceiverState; +use crate::safekeeper::Term; +use crate::safekeeper::{ServerInfo, TermLsn}; +use crate::send_wal::WalSenderState; +use crate::timeline::PeerInfo; +use crate::timelines_global_map::TimelineDeleteForceResult; +use crate::GlobalTimelines; +use crate::SafeKeeperConf; +use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline}; #[derive(Debug, Serialize)] struct SafekeeperStatus { @@ -85,11 +81,11 @@ impl From for TermLsn { } } -/// Augment AcceptorState with epoch for convenience +/// Augment AcceptorState with last_log_term for convenience #[derive(Debug, Serialize, Deserialize)] pub struct AcceptorStateStatus { pub term: Term, - pub epoch: Term, + pub epoch: Term, // aka last_log_term pub term_history: Vec, } @@ -130,7 +126,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result) -> Result) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + 
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + // Note: with evicted timelines it should work better then de-evict them and + // stream; probably start_snapshot would copy partial s3 file to dest path + // and stream control file, or return FullAccessTimeline if timeline is not + // evicted. + let tli = tli + .full_access_guard() + .await + .map_err(ApiError::InternalServerError)?; + + // To stream the body use wrap_stream which wants Stream of Result, + // so create the chan and write to it in another task. + let (tx, rx) = mpsc::channel(1); + + task::spawn(pull_timeline::stream_snapshot(tli, tx)); + + let rx_stream = ReceiverStream::new(rx); + let body = Body::wrap_stream(rx_stream); + + let response = Response::builder() + .status(200) + .header(hyper::header::CONTENT_TYPE, "application/octet-stream") + .body(body) + .unwrap(); + + Ok(response) +} + async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -249,6 +282,10 @@ async fn timeline_digest_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { +/// Force persist control file. 
+async fn timeline_checkpoint_handler(request: Request) -> Result, ApiError> { + check_permission(&request, None)?; + let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); - check_permission(&request, Some(ttid.tenant_id))?; - let filename: String = parse_request_param(&request, "filename")?; - - let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; - - let filepath = tli.timeline_dir.join(filename); - let mut file = File::open(&filepath) + let tli = GlobalTimelines::get(ttid)?; + tli.write_shared_state() .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; - - let mut content = Vec::new(); - // TODO: don't store files in memory - file.read_to_end(&mut content) + .sk + .state + .flush() .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; - - Response::builder() - .status(StatusCode::OK) - .header("Content-Type", "application/octet-stream") - .body(Body::from(content)) - .map_err(|e| ApiError::InternalServerError(e.into())) + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, ()) } /// Deactivates the timeline and removes its data directory. -async fn timeline_delete_force_handler( - mut request: Request, -) -> Result, ApiError> { +async fn timeline_delete_handler(mut request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( parse_request_param(&request, "tenant_id")?, parse_request_param(&request, "timeline_id")?, ); + let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); check_permission(&request, Some(ttid.tenant_id))?; ensure_no_body(&mut request).await?; // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better // error handling here when we're able to. 
- let resp = GlobalTimelines::delete_force(&ttid) + let resp = GlobalTimelines::delete(&ttid, only_local) .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::OK, resp) } /// Deactivates all timelines for the tenant and removes its data directory. -/// See `timeline_delete_force_handler`. -async fn tenant_delete_force_handler( - mut request: Request, -) -> Result, ApiError> { +/// See `timeline_delete_handler`. +async fn tenant_delete_handler(mut request: Request) -> Result, ApiError> { let tenant_id = parse_request_param(&request, "tenant_id")?; + let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false); check_permission(&request, Some(tenant_id))?; ensure_no_body(&mut request).await?; // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons; // Using an `InternalServerError` should be fixed when the types support it - let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id) + let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local) .await .map_err(ApiError::InternalServerError)?; json_response( @@ -352,6 +376,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result) -> Result Ok(response) } +async fn patch_control_file_handler( + mut request: Request, +) -> Result, ApiError> { + check_permission(&request, None)?; + + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let patch_request: patch_control_file::Request = json_request(&mut request).await?; + let response = patch_control_file::handle_request(tli, patch_request) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + /// Safekeeper http router. 
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let mut router = endpoint::make_router(); @@ -497,9 +542,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder router .data(Arc::new(conf)) .data(auth) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { + check_permission(&r, None)?; let cancel = CancellationToken::new(); failpoints_handler(r, cancel).await }) @@ -512,22 +559,30 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder request_span(r, timeline_status_handler) }) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - request_span(r, timeline_delete_force_handler) + request_span(r, timeline_delete_handler) }) .delete("/v1/tenant/:tenant_id", |r| { - request_span(r, tenant_delete_force_handler) + request_span(r, tenant_delete_handler) }) + .get( + "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot", + |r| request_span(r, timeline_snapshot_handler), + ) .post("/v1/pull_timeline", |r| { request_span(r, timeline_pull_handler) }) - .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", - |r| request_span(r, timeline_files_handler), - ) .post( "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), ) + .patch( + "/v1/tenant/:tenant_id/timeline/:timeline_id/control_file", + |r| request_span(r, patch_control_file_handler), + ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + |r| request_span(r, timeline_checkpoint_handler), + ) // for tests .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 303bdd67fe..27e54776e0 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -6,8 +6,6 @@ //! modifications in tests. //! 
-use std::sync::Arc; - use anyhow::Context; use bytes::Bytes; use postgres_backend::QueryError; @@ -21,8 +19,9 @@ use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo}; use crate::safekeeper::{ AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, }; -use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermLsn}; -use crate::timeline::Timeline; +use crate::safekeeper::{Term, TermHistory, TermLsn}; +use crate::state::TimelinePersistentState; +use crate::timeline::FullAccessTimeline; use crate::GlobalTimelines; use postgres_backend::PostgresBackend; use postgres_ffi::encode_logical_message; @@ -56,7 +55,7 @@ pub struct AppendLogicalMessage { #[derive(Debug, Serialize)] struct AppendResult { // safekeeper state after append - state: SafeKeeperState, + state: TimelinePersistentState, // info about new record in the WAL inserted_wal: InsertedWAL, } @@ -103,8 +102,8 @@ pub async fn handle_json_ctrl( async fn prepare_safekeeper( ttid: TenantTimelineId, pg_version: u32, -) -> anyhow::Result> { - GlobalTimelines::create( +) -> anyhow::Result { + let tli = GlobalTimelines::create( ttid, ServerInfo { pg_version, @@ -114,10 +113,16 @@ async fn prepare_safekeeper( Lsn::INVALID, Lsn::INVALID, ) - .await + .await?; + + tli.full_access_guard().await } -async fn send_proposer_elected(tli: &Arc, term: Term, lsn: Lsn) -> anyhow::Result<()> { +async fn send_proposer_elected( + tli: &FullAccessTimeline, + term: Term, + lsn: Lsn, +) -> anyhow::Result<()> { // add new term to existing history let history = tli.get_state().await.1.acceptor_state.term_history; let history = history.up_to(lsn.checked_sub(1u64).unwrap()); @@ -146,7 +151,7 @@ pub struct InsertedWAL { /// Extend local WAL with new LogicalMessage record. To do that, /// create AppendRequest with new WAL and pass it to safekeeper. 
pub async fn append_logical_message( - tli: &Arc, + tli: &FullAccessTimeline, msg: &AppendLogicalMessage, ) -> anyhow::Result { let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message); @@ -164,7 +169,7 @@ pub async fn append_logical_message( let append_request = ProposerAcceptorMessage::AppendRequest(AppendRequest { h: AppendRequestHeader { term: msg.term, - epoch_start_lsn: begin_lsn, + term_start_lsn: begin_lsn, begin_lsn, end_lsn, commit_lsn, diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index fc5f99eb00..cbd67f0064 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -7,10 +7,7 @@ use tokio::runtime::Runtime; use std::time::Duration; use storage_broker::Uri; -use utils::{ - auth::SwappableJwtAuth, - id::{NodeId, TenantId, TenantTimelineId}, -}; +use utils::{auth::SwappableJwtAuth, id::NodeId, logging::SecretString}; mod auth; pub mod broker; @@ -22,14 +19,19 @@ pub mod handler; pub mod http; pub mod json_ctrl; pub mod metrics; +pub mod patch_control_file; pub mod pull_timeline; pub mod receive_wal; pub mod recovery; pub mod remove_wal; pub mod safekeeper; pub mod send_wal; +pub mod state; pub mod timeline; +pub mod timeline_manager; +pub mod timelines_set; pub mod wal_backup; +pub mod wal_backup_partial; pub mod wal_service; pub mod wal_storage; @@ -46,6 +48,7 @@ pub mod defaults { pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms"; pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); + pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; } #[derive(Debug, Clone)] @@ -75,17 +78,18 @@ pub struct SafeKeeperConf { pub pg_auth: Option>, pub pg_tenant_only_auth: Option>, pub http_auth: Option>, + /// JWT token to connect to other safekeepers with. 
+ pub sk_auth_token: Option, pub current_thread_runtime: bool, + pub walsenders_keep_horizon: bool, + pub partial_backup_enabled: bool, + pub partial_backup_timeout: Duration, + pub disable_periodic_broker_push: bool, } impl SafeKeeperConf { - pub fn tenant_dir(&self, tenant_id: &TenantId) -> Utf8PathBuf { - self.workdir.join(tenant_id.to_string()) - } - - pub fn timeline_dir(&self, ttid: &TenantTimelineId) -> Utf8PathBuf { - self.tenant_dir(&ttid.tenant_id) - .join(ttid.timeline_id.to_string()) + pub fn is_wal_backup_enabled(&self) -> bool { + self.remote_storage.is_some() && self.wal_backup_enabled } } @@ -112,9 +116,14 @@ impl SafeKeeperConf { pg_auth: None, pg_tenant_only_auth: None, http_auth: None, + sk_auth_token: None, heartbeat_timeout: Duration::new(5, 0), max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, + walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 11a3f48922..1e965393e3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -11,8 +11,9 @@ use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge, - IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec, + register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, + register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, + IntGaugeVec, }; use once_cell::sync::Lazy; @@ -21,7 +22,7 @@ use utils::pageserver_feedback::PageserverFeedback; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ - safekeeper::{SafeKeeperState, SafekeeperMemState}, + state::{TimelineMemState, TimelinePersistentState}, GlobalTimelines, }; @@ -110,7 +111,7 
@@ pub static REMOVED_WAL_SEGMENTS: Lazy = Lazy::new(|| { pub static BACKED_UP_SEGMENTS: Lazy = Lazy::new(|| { register_int_counter!( "safekeeper_backed_up_segments_total", - "Number of WAL segments backed up to the broker" + "Number of WAL segments backed up to the S3" ) .expect("Failed to register safekeeper_backed_up_segments_total counter") }); @@ -140,6 +141,51 @@ pub static BROKER_ITERATION_TIMELINES: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec") }); +pub static RECEIVED_PS_FEEDBACKS: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_received_ps_feedbacks_total", + "Number of pageserver feedbacks received" + ) + .expect("Failed to register safekeeper_received_ps_feedbacks_total counter") +}); +pub static PARTIAL_BACKUP_UPLOADS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_partial_backup_uploads_total", + "Number of partial backup uploads to the S3", + &["result"] + ) + .expect("Failed to register safekeeper_partial_backup_uploads_total counter") +}); +pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_partial_backup_uploaded_bytes_total", + "Number of bytes uploaded to the S3 during partial backup" + ) + .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter") +}); +pub static MANAGER_ITERATIONS_TOTAL: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_iterations_total", + "Number of iterations of the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_iterations_total counter") +}); +pub static MANAGER_ACTIVE_CHANGES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_manager_active_changes_total", + "Number of timeline active status changes in the timeline manager task" + ) + .expect("Failed to register safekeeper_manager_active_changes_total counter") +}); +pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { + register_int_counter_pair!( 
+ "safekeeper_wal_backup_tasks_started_total", + "Number of active WAL backup tasks", + "safekeeper_wal_backup_tasks_finished_total", + "Number of finished WAL backup tasks", + ) + .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; @@ -301,18 +347,18 @@ pub async fn time_io_closure>( #[derive(Clone)] pub struct FullTimelineInfo { pub ttid: TenantTimelineId, - pub ps_feedback: PageserverFeedback, + pub ps_feedback_count: u64, + pub last_ps_feedback: PageserverFeedback, pub wal_backup_active: bool, pub timeline_is_active: bool, pub num_computes: u32, pub last_removed_segno: XLogSegNo, pub epoch_start_lsn: Lsn, - pub mem_state: SafekeeperMemState, - pub persisted_state: SafeKeeperState, + pub mem_state: TimelineMemState, + pub persisted_state: TimelinePersistentState, pub flush_lsn: Lsn, - pub remote_consistent_lsn: Lsn, pub wal_storage: WalStorageMetrics, } @@ -328,6 +374,7 @@ pub struct TimelineCollector { remote_consistent_lsn: GenericGaugeVec, ps_last_received_lsn: GenericGaugeVec, feedback_last_time_seconds: GenericGaugeVec, + ps_feedback_count: GenericGaugeVec, timeline_active: GenericGaugeVec, wal_backup_active: GenericGaugeVec, connected_computes: IntGaugeVec, @@ -338,6 +385,7 @@ pub struct TimelineCollector { flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, timelines_count: IntGauge, + active_timelines_count: IntGauge, } impl Default for TimelineCollector { @@ -430,6 +478,15 @@ impl TimelineCollector { .unwrap(); descs.extend(feedback_last_time_seconds.desc().into_iter().cloned()); + let ps_feedback_count = GenericGaugeVec::new( + Opts::new( + "safekeeper_ps_feedback_count_total", + "Number of feedbacks received from the pageserver", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + let timeline_active = GenericGaugeVec::new( Opts::new( "safekeeper_timeline_active", @@ -521,6 +578,13 @@ impl TimelineCollector { .unwrap(); 
descs.extend(timelines_count.desc().into_iter().cloned()); + let active_timelines_count = IntGauge::new( + "safekeeper_active_timelines", + "Total number of active timelines", + ) + .unwrap(); + descs.extend(active_timelines_count.desc().into_iter().cloned()); + TimelineCollector { descs, commit_lsn, @@ -531,6 +595,7 @@ impl TimelineCollector { remote_consistent_lsn, ps_last_received_lsn, feedback_last_time_seconds, + ps_feedback_count, timeline_active, wal_backup_active, connected_computes, @@ -541,6 +606,7 @@ impl TimelineCollector { flushed_wal_seconds, collect_timeline_metrics, timelines_count, + active_timelines_count, } } } @@ -562,6 +628,7 @@ impl Collector for TimelineCollector { self.remote_consistent_lsn.reset(); self.ps_last_received_lsn.reset(); self.feedback_last_time_seconds.reset(); + self.ps_feedback_count.reset(); self.timeline_active.reset(); self.wal_backup_active.reset(); self.connected_computes.reset(); @@ -571,8 +638,8 @@ impl Collector for TimelineCollector { self.written_wal_seconds.reset(); self.flushed_wal_seconds.reset(); - let timelines = GlobalTimelines::get_all(); - let timelines_count = timelines.len(); + let timelines_count = GlobalTimelines::get_all().len(); + let mut active_timelines_count = 0; // Prometheus Collector is sync, and data is stored under async lock. 
To // bridge the gap with a crutch, collect data in spawned thread with @@ -591,6 +658,10 @@ impl Collector for TimelineCollector { let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; + if tli.timeline_is_active { + active_timelines_count += 1; + } + self.commit_lsn .with_label_values(labels) .set(tli.mem_state.commit_lsn.into()); @@ -608,7 +679,7 @@ impl Collector for TimelineCollector { .set(tli.mem_state.peer_horizon_lsn.into()); self.remote_consistent_lsn .with_label_values(labels) - .set(tli.remote_consistent_lsn.into()); + .set(tli.mem_state.remote_consistent_lsn.into()); self.timeline_active .with_label_values(labels) .set(tli.timeline_is_active as u64); @@ -633,9 +704,12 @@ impl Collector for TimelineCollector { self.ps_last_received_lsn .with_label_values(labels) - .set(tli.ps_feedback.last_received_lsn.0); + .set(tli.last_ps_feedback.last_received_lsn.0); + self.ps_feedback_count + .with_label_values(labels) + .set(tli.ps_feedback_count); if let Ok(unix_time) = tli - .ps_feedback + .last_ps_feedback .replytime .duration_since(SystemTime::UNIX_EPOCH) { @@ -666,6 +740,7 @@ impl Collector for TimelineCollector { mfs.extend(self.remote_consistent_lsn.collect()); mfs.extend(self.ps_last_received_lsn.collect()); mfs.extend(self.feedback_last_time_seconds.collect()); + mfs.extend(self.ps_feedback_count.collect()); mfs.extend(self.timeline_active.collect()); mfs.extend(self.wal_backup_active.collect()); mfs.extend(self.connected_computes.collect()); @@ -684,15 +759,19 @@ impl Collector for TimelineCollector { self.timelines_count.set(timelines_count as i64); mfs.extend(self.timelines_count.collect()); + self.active_timelines_count + .set(active_timelines_count as i64); + mfs.extend(self.active_timelines_count.collect()); + mfs } } async fn collect_timeline_metrics() -> Vec { let mut res = vec![]; - let timelines = GlobalTimelines::get_all(); + let active_timelines = 
GlobalTimelines::get_global_broker_active_set().get_all(); - for tli in timelines { + for tli in active_timelines { if let Some(info) = tli.info_for_metrics().await { res.push(info); } diff --git a/safekeeper/src/patch_control_file.rs b/safekeeper/src/patch_control_file.rs new file mode 100644 index 0000000000..2136d1b5f7 --- /dev/null +++ b/safekeeper/src/patch_control_file.rs @@ -0,0 +1,85 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use tracing::info; + +use crate::{state::TimelinePersistentState, timeline::Timeline}; + +#[derive(Deserialize, Debug, Clone)] +pub struct Request { + /// JSON object with fields to update + pub updates: serde_json::Value, + /// List of fields to apply + pub apply_fields: Vec, +} + +#[derive(Serialize)] +pub struct Response { + pub old_control_file: TimelinePersistentState, + pub new_control_file: TimelinePersistentState, +} + +/// Patch control file with given request. Will update the persistent state using +/// fields from the request and persist the new state on disk. 
+pub async fn handle_request(tli: Arc, request: Request) -> anyhow::Result { + let response = tli + .map_control_file(|state| { + let old_control_file = state.clone(); + let new_control_file = state_apply_diff(&old_control_file, &request)?; + + info!( + "patching control file, old: {:?}, new: {:?}, patch: {:?}", + old_control_file, new_control_file, request + ); + *state = new_control_file.clone(); + + Ok(Response { + old_control_file, + new_control_file, + }) + }) + .await?; + + Ok(response) +} + +fn state_apply_diff( + state: &TimelinePersistentState, + request: &Request, +) -> anyhow::Result { + let mut json_value = serde_json::to_value(state)?; + + if let Value::Object(a) = &mut json_value { + if let Value::Object(b) = &request.updates { + json_apply_diff(a, b, &request.apply_fields)?; + } else { + anyhow::bail!("request.updates is not a json object") + } + } else { + anyhow::bail!("TimelinePersistentState is not a json object") + } + + let new_state: TimelinePersistentState = serde_json::from_value(json_value)?; + Ok(new_state) +} + +fn json_apply_diff( + object: &mut serde_json::Map, + updates: &serde_json::Map, + apply_keys: &Vec, +) -> anyhow::Result<()> { + for key in apply_keys { + if let Some(new_value) = updates.get(key) { + if let Some(existing_value) = object.get_mut(key) { + *existing_value = new_value.clone(); + } else { + anyhow::bail!("key not found in original object: {}", key); + } + } else { + anyhow::bail!("key not found in request.updates: {}", key); + } + } + + Ok(()) +} diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 93b51f32c0..66c41f65ff 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,27 +1,244 @@ -use std::sync::Arc; - +use anyhow::{anyhow, bail, Context, Result}; +use bytes::Bytes; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use chrono::{DateTime, Utc}; +use futures::{SinkExt, StreamExt, TryStreamExt}; +use postgres_ffi::{XLogFileName, 
XLogSegNo, PG_TLI}; use serde::{Deserialize, Serialize}; - -use anyhow::{bail, Context, Result}; -use tokio::io::AsyncWriteExt; -use tracing::info; -use utils::{ - id::{TenantId, TenantTimelineId, TimelineId}, - lsn::Lsn, +use std::{ + cmp::min, + io::{self, ErrorKind}, + sync::Arc, }; +use tokio::{ + fs::{File, OpenOptions}, + io::AsyncWrite, + sync::mpsc, + task, +}; +use tokio_tar::{Archive, Builder}; +use tokio_util::{ + io::{CopyToBytes, SinkWriter}, + sync::PollSender, +}; +use tracing::{error, info, instrument}; use crate::{ - control_file, debug_dump, - http::routes::TimelineStatus, - timeline::{Timeline, TimelineError}, - wal_storage::{self, Storage}, + control_file::{self, CONTROL_FILE_NAME}, + debug_dump, + http::{ + client::{self, Client}, + routes::TimelineStatus, + }, + safekeeper::Term, + timeline::{get_tenant_dir, get_timeline_dir, FullAccessTimeline, Timeline, TimelineError}, + wal_storage::{self, open_wal_file, Storage}, GlobalTimelines, SafeKeeperConf, }; +use utils::{ + crashsafe::{durable_rename, fsync_async_opt}, + id::{TenantId, TenantTimelineId, TimelineId}, + logging::SecretString, + lsn::Lsn, + pausable_failpoint, +}; -/// Info about timeline on safekeeper ready for reporting. +/// Stream tar archive of timeline to tx. +#[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] +pub async fn stream_snapshot(tli: FullAccessTimeline, tx: mpsc::Sender>) { + if let Err(e) = stream_snapshot_guts(tli, tx.clone()).await { + // Error type/contents don't matter as they won't can't reach the client + // (hyper likely doesn't do anything with it), but http stream will be + // prematurely terminated. It would be nice to try to send the error in + // trailers though. + tx.send(Err(anyhow!("snapshot failed"))).await.ok(); + error!("snapshot failed: {:#}", e); + } +} + +/// State needed while streaming the snapshot. 
+pub struct SnapshotContext { + pub from_segno: XLogSegNo, // including + pub upto_segno: XLogSegNo, // including + pub term: Term, + pub last_log_term: Term, + pub flush_lsn: Lsn, + pub wal_seg_size: usize, + // used to remove WAL hold off in Drop. + pub tli: FullAccessTimeline, +} + +impl Drop for SnapshotContext { + fn drop(&mut self) { + let tli = self.tli.clone(); + task::spawn(async move { + let mut shared_state = tli.write_shared_state().await; + shared_state.wal_removal_on_hold = false; + }); + } +} + +pub async fn stream_snapshot_guts( + tli: FullAccessTimeline, + tx: mpsc::Sender>, +) -> Result<()> { + // tokio-tar wants Write implementor, but we have mpsc tx >; + // use SinkWriter as a Write impl. That is, + // - create Sink from the tx. It returns PollSendError if chan is closed. + let sink = PollSender::new(tx); + // - SinkWriter needs sink error to be io one, map it. + let sink_io_err = sink.sink_map_err(|_| io::Error::from(ErrorKind::BrokenPipe)); + // - SinkWriter wants sink type to be just Bytes, not Result, so map + // it with with(). Note that with() accepts async function which we don't + // need and allows the map to fail, which we don't need either, but hence + // two Oks. + let oksink = sink_io_err.with(|b: Bytes| async { io::Result::Ok(Result::Ok(b)) }); + // - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap + // into CopyToBytes. This is a data copy. + let copy_to_bytes = CopyToBytes::new(oksink); + let mut writer = SinkWriter::new(copy_to_bytes); + let pinned_writer = std::pin::pin!(writer); + + // Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer + // which is also likely suboptimal. 
+ let mut ar = Builder::new_non_terminated(pinned_writer); + + let bctx = tli.start_snapshot(&mut ar).await?; + pausable_failpoint!("sk-snapshot-after-list-pausable"); + + let tli_dir = tli.get_timeline_dir(); + info!( + "sending {} segments [{:#X}-{:#X}], term={}, last_log_term={}, flush_lsn={}", + bctx.upto_segno - bctx.from_segno + 1, + bctx.from_segno, + bctx.upto_segno, + bctx.term, + bctx.last_log_term, + bctx.flush_lsn, + ); + for segno in bctx.from_segno..=bctx.upto_segno { + let (mut sf, is_partial) = open_wal_file(&tli_dir, segno, bctx.wal_seg_size).await?; + let mut wal_file_name = XLogFileName(PG_TLI, segno, bctx.wal_seg_size); + if is_partial { + wal_file_name.push_str(".partial"); + } + ar.append_file(&wal_file_name, &mut sf).await?; + } + + // Do the term check before ar.finish to make archive corrupted in case of + // term change. Client shouldn't ignore abrupt stream end, but to be sure. + tli.finish_snapshot(&bctx).await?; + + ar.finish().await?; + + Ok(()) +} + +impl FullAccessTimeline { + /// Start streaming tar archive with timeline: + /// 1) stream control file under lock; + /// 2) hold off WAL removal; + /// 3) collect SnapshotContext to understand which WAL segments should be + /// streamed. + /// + /// Snapshot streams data up to flush_lsn. To make this safe, we must check + /// that term doesn't change during the procedure, or we risk sending mix of + /// WAL from different histories. Term is remembered in the SnapshotContext + /// and checked in finish_snapshot. Note that in the last segment some WAL + /// higher than flush_lsn set here might be streamed; that's fine as long as + /// terms doesn't change. + /// + /// Alternatively we could send only up to commit_lsn to get some valid + /// state which later will be recovered by compute, in this case term check + /// is not needed, but we likely don't want that as there might be no + /// compute which could perform the recovery. 
+ /// + /// When returned SnapshotContext is dropped WAL hold is removed. + async fn start_snapshot( + &self, + ar: &mut tokio_tar::Builder, + ) -> Result { + let mut shared_state = self.write_shared_state().await; + + let cf_path = self.get_timeline_dir().join(CONTROL_FILE_NAME); + let mut cf = File::open(cf_path).await?; + ar.append_file(CONTROL_FILE_NAME, &mut cf).await?; + + // We need to stream since the oldest segment someone (s3 or pageserver) + // still needs. This duplicates calc_horizon_lsn logic. + // + // We know that WAL wasn't removed up to this point because it cannot be + // removed further than `backup_lsn`. Since we're holding shared_state + // lock and setting `wal_removal_on_hold` later, it guarantees that WAL + // won't be removed until we're done. + let from_lsn = min( + shared_state.sk.state.remote_consistent_lsn, + shared_state.sk.state.backup_lsn, + ); + if from_lsn == Lsn::INVALID { + // this is possible if snapshot is called before handling first + // elected message + bail!("snapshot is called on uninitialized timeline"); + } + let from_segno = from_lsn.segment_number(shared_state.get_wal_seg_size()); + let term = shared_state.sk.get_term(); + let last_log_term = shared_state.sk.get_last_log_term(); + let flush_lsn = shared_state.sk.flush_lsn(); + let upto_segno = flush_lsn.segment_number(shared_state.get_wal_seg_size()); + // have some limit on max number of segments as a sanity check + const MAX_ALLOWED_SEGS: u64 = 1000; + let num_segs = upto_segno - from_segno + 1; + if num_segs > MAX_ALLOWED_SEGS { + bail!( + "snapshot is called on timeline with {} segments, but the limit is {}", + num_segs, + MAX_ALLOWED_SEGS + ); + } + + // Prevent WAL removal while we're streaming data. + // + // Since this a flag, not a counter just bail out if already set; we + // shouldn't need concurrent snapshotting. 
+ if shared_state.wal_removal_on_hold { + bail!("wal_removal_on_hold is already true"); + } + shared_state.wal_removal_on_hold = true; + + let bctx = SnapshotContext { + from_segno, + upto_segno, + term, + last_log_term, + flush_lsn, + wal_seg_size: shared_state.get_wal_seg_size(), + tli: self.clone(), + }; + + Ok(bctx) + } + + /// Finish snapshotting: check that term(s) hasn't changed. + /// + /// Note that WAL gc hold off is removed in Drop of SnapshotContext to not + /// forget this if snapshotting fails mid the way. + pub async fn finish_snapshot(&self, bctx: &SnapshotContext) -> Result<()> { + let shared_state = self.read_shared_state().await; + let term = shared_state.sk.get_term(); + let last_log_term = shared_state.sk.get_last_log_term(); + // There are some cases to relax this check (e.g. last_log_term might + // change, but as long as older history is strictly part of new that's + // fine), but there is no need to do it. + if bctx.term != term || bctx.last_log_term != last_log_term { + bail!("term(s) changed during snapshot: were term={}, last_log_term={}, now term={}, last_log_term={}", + bctx.term, bctx.last_log_term, term, last_log_term); + } + Ok(()) + } +} + +/// pull_timeline request body. #[derive(Debug, Serialize, Deserialize)] pub struct Request { pub tenant_id: TenantId, @@ -47,7 +264,10 @@ pub struct DebugDumpResponse { } /// Find the most advanced safekeeper and pull timeline from it. 
-pub async fn handle_request(request: Request) -> Result { +pub async fn handle_request( + request: Request, + sk_auth_token: Option, +) -> Result { let existing_tli = GlobalTimelines::get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -56,28 +276,26 @@ pub async fn handle_request(request: Request) -> Result { bail!("Timeline {} already exists", request.timeline_id); } - let client = reqwest::Client::new(); let http_hosts = request.http_hosts.clone(); - // Send request to /v1/tenant/:tenant_id/timeline/:timeline_id - let responses = futures::future::join_all(http_hosts.iter().map(|url| { - let url = format!( - "{}/v1/tenant/{}/timeline/{}", - url, request.tenant_id, request.timeline_id - ); - client.get(url).send() - })) - .await; + // Figure out statuses of potential donors. + let responses: Vec> = + futures::future::join_all(http_hosts.iter().map(|url| async { + let cclient = Client::new(url.clone(), sk_auth_token.clone()); + let info = cclient + .timeline_status(request.tenant_id, request.timeline_id) + .await?; + Ok(info) + })) + .await; let mut statuses = Vec::new(); for (i, response) in responses.into_iter().enumerate() { - let response = response.context(format!("Failed to get status from {}", http_hosts[i]))?; - let status: crate::http::routes::TimelineStatus = response.json().await?; + let status = response.context(format!("fetching status from {}", http_hosts[i]))?; statuses.push((status, i)); } // Find the most advanced safekeeper - // TODO: current logic may be wrong, fix it later let (status, i) = statuses .into_iter() .max_by_key(|(status, _)| { @@ -93,10 +311,14 @@ pub async fn handle_request(request: Request) -> Result { assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline(status, safekeeper_host).await + pull_timeline(status, safekeeper_host, sk_auth_token).await } -async fn pull_timeline(status: TimelineStatus, host: String) -> Result { +async fn pull_timeline( + 
status: TimelineStatus, + host: String, + sk_auth_token: Option, +) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", @@ -110,86 +332,53 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result let conf = &GlobalTimelines::get_global_config(); - let client = reqwest::Client::new(); - // TODO: don't use debug dump, it should be used only in tests. - // This is a proof of concept, we should figure out a way - // to use scp without implementing it manually. - - // Implementing our own scp over HTTP. - // At first, we need to fetch list of files from safekeeper. - let dump: DebugDumpResponse = client - .get(format!( - "{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}", - host, status.tenant_id, status.timeline_id - )) - .send() - .await? - .json() - .await?; - - if dump.timelines.len() != 1 { - bail!( - "expected to fetch single timeline, got {} timelines", - dump.timelines.len() - ); - } - - let timeline = dump.timelines.into_iter().next().unwrap(); - let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!( - "timeline {} doesn't have disk content", - ttid - ))?; - - let mut filenames = disk_content - .files - .iter() - .map(|file| file.name.clone()) - .collect::>(); - - // Sort filenames to make sure we pull files in correct order - // After sorting, we should have: - // - 000000010000000000000001 - // - ... 
- // - 000000010000000000000002.partial - // - safekeeper.control - filenames.sort(); - - // safekeeper.control should be the first file, so we need to move it to the beginning - let control_file_index = filenames - .iter() - .position(|name| name == "safekeeper.control") - .ok_or(anyhow::anyhow!("safekeeper.control not found"))?; - filenames.remove(control_file_index); - filenames.insert(0, "safekeeper.control".to_string()); - - info!( - "downloading {} files from safekeeper {}", - filenames.len(), - host - ); - let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - // Note: some time happens between fetching list of files and fetching files themselves. - // It's possible that some files will be removed from safekeeper and we will fail to fetch them. - // This function will fail in this case, should be retried by the caller. - for filename in filenames { - let file_path = tli_dir_path.join(&filename); - // /v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename - let http_url = format!( - "{}/v1/tenant/{}/timeline/{}/file/{}", - host, status.tenant_id, status.timeline_id, filename - ); + let client = Client::new(host.clone(), sk_auth_token.clone()); + // Request stream with basebackup archive. + let bb_resp = client + .snapshot(status.tenant_id, status.timeline_id) + .await?; - let mut file = tokio::fs::File::create(&file_path).await?; - let mut response = client.get(&http_url).send().await?; - while let Some(chunk) = response.chunk().await? { - file.write_all(&chunk).await?; - file.flush().await?; + // Make Stream of Bytes from it... + let bb_stream = bb_resp.bytes_stream().map_err(std::io::Error::other); + // and turn it into StreamReader implementing AsyncRead. + let bb_reader = tokio_util::io::StreamReader::new(bb_stream); + + // Extract it on the fly to the disk. We don't use simple unpack() to fsync + // files. 
+ let mut entries = Archive::new(bb_reader).entries()?; + while let Some(base_tar_entry) = entries.next().await { + let mut entry = base_tar_entry?; + let header = entry.header(); + let file_path = header.path()?.into_owned(); + match header.entry_type() { + tokio_tar::EntryType::Regular => { + let utf8_file_path = + Utf8PathBuf::from_path_buf(file_path).expect("non-Unicode path"); + let dst_path = tli_dir_path.join(utf8_file_path); + let mut f = OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .open(&dst_path) + .await?; + tokio::io::copy(&mut entry, &mut f).await?; + // fsync the file + f.sync_all().await?; + } + _ => { + bail!( + "entry {} in backup tar archive is of unexpected type: {:?}", + file_path.display(), + header.entry_type() + ); + } } } - // TODO: fsync? + // fsync temp timeline directory to remember its contents. + fsync_async_opt(&tli_dir_path, !conf.no_sync).await?; // Let's create timeline from temp directory and verify that it's correct let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?; @@ -273,14 +462,16 @@ pub async fn load_temp_timeline( } // Move timeline dir to the correct location - let timeline_path = conf.timeline_dir(&ttid); + let timeline_path = get_timeline_dir(conf, &ttid); info!( "moving timeline {} from {} to {}", ttid, tmp_path, timeline_path ); - tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; - tokio::fs::rename(tmp_path, &timeline_path).await?; + tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?; + // fsync tenant dir creation + fsync_async_opt(&conf.workdir, !conf.no_sync).await?; + durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; let tli = GlobalTimelines::load_timeline(&guard, ttid) .await diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 9ce9b049ba..7943a2fd86 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -6,7 +6,7 @@ use 
crate::handler::SafekeeperPostgresHandler; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; -use crate::timeline::Timeline; +use crate::timeline::FullAccessTimeline; use crate::wal_service::ConnectionId; use crate::GlobalTimelines; use anyhow::{anyhow, Context}; @@ -36,11 +36,18 @@ use tokio::time::Instant; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; +use utils::pageserver_feedback::PageserverFeedback; + +const DEFAULT_FEEDBACK_CAPACITY: usize = 8; /// Registry of WalReceivers (compute connections). Timeline holds it (wrapped /// in Arc). pub struct WalReceivers { mutex: Mutex, + pageserver_feedback_tx: tokio::sync::broadcast::Sender, + + num_computes_tx: tokio::sync::watch::Sender, + num_computes_rx: tokio::sync::watch::Receiver, } /// Id under which walreceiver is registered in shmem. @@ -48,15 +55,24 @@ type WalReceiverId = usize; impl WalReceivers { pub fn new() -> Arc { + let (pageserver_feedback_tx, _) = + tokio::sync::broadcast::channel(DEFAULT_FEEDBACK_CAPACITY); + + let (num_computes_tx, num_computes_rx) = tokio::sync::watch::channel(0usize); + Arc::new(WalReceivers { mutex: Mutex::new(WalReceiversShared { slots: Vec::new() }), + pageserver_feedback_tx, + num_computes_tx, + num_computes_rx, }) } /// Register new walreceiver. Returned guard provides access to the slot and /// automatically deregisters in Drop. pub fn register(self: &Arc, conn_id: Option) -> WalReceiverGuard { - let slots = &mut self.mutex.lock().slots; + let mut shared = self.mutex.lock(); + let slots = &mut shared.slots; let walreceiver = WalReceiverState { conn_id, status: WalReceiverStatus::Voting, @@ -70,6 +86,9 @@ impl WalReceivers { slots.push(Some(walreceiver)); pos }; + + self.update_num(&shared); + WalReceiverGuard { id: pos, walreceivers: self.clone(), @@ -91,7 +110,18 @@ impl WalReceivers { /// Get number of walreceivers (compute connections). 
pub fn get_num(self: &Arc) -> usize { - self.mutex.lock().slots.iter().flatten().count() + self.mutex.lock().get_num() + } + + /// Get channel for number of walreceivers. + pub fn get_num_rx(self: &Arc) -> tokio::sync::watch::Receiver { + self.num_computes_rx.clone() + } + + /// Should get called after every update of slots. + fn update_num(self: &Arc, shared: &MutexGuard) { + let num = shared.get_num(); + self.num_computes_tx.send_replace(num); } /// Get state of all walreceivers. @@ -115,6 +145,13 @@ impl WalReceivers { fn unregister(self: &Arc, id: WalReceiverId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; + self.update_num(&shared); + } + + /// Broadcast pageserver feedback to connected walproposers. + pub fn broadcast_pageserver_feedback(&self, feedback: PageserverFeedback) { + // Err means there is no subscribers, it is fine. + let _ = self.pageserver_feedback_tx.send(feedback); } } @@ -123,6 +160,13 @@ struct WalReceiversShared { slots: Vec>, } +impl WalReceiversShared { + /// Get number of walreceivers (compute connections). + fn get_num(&self) -> usize { + self.slots.iter().flatten().count() + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalReceiverState { /// None means it is recovery initiated by us (this safekeeper). @@ -169,9 +213,19 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, ) -> Result<(), QueryError> { - if let Err(end) = self.handle_start_wal_push_guts(pgb).await { + let mut tli: Option = None; + if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { // Log the result and probably send it to the client, closing the stream. - pgb.handle_copy_stream_end(end).await; + let handle_end_fut = pgb.handle_copy_stream_end(end); + // If we managed to create the timeline, augment logging with current LSNs etc. 
+ if let Some(tli) = tli { + let info = tli.get_safekeeper_info(&self.conf).await; + handle_end_fut + .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn))) + .await; + } else { + handle_end_fut.await; + } } Ok(()) } @@ -179,6 +233,7 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push_guts( &mut self, pgb: &mut PostgresBackend, + tli: &mut Option, ) -> Result<(), CopyStreamHandlerEnd> { // Notify the libpq client that it's allowed to send `CopyData` messages pgb.write_message(&BeMessage::CopyBothResponse).await?; @@ -197,17 +252,32 @@ impl SafekeeperPostgresHandler { // sends, so this avoids deadlocks. let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?; let peer_addr = *pgb.get_peer_addr(); - let network_reader = NetworkReader { + let mut network_reader = NetworkReader { ttid: self.ttid, conn_id: self.conn_id, pgb_reader: &mut pgb_reader, peer_addr, acceptor_handle: &mut acceptor_handle, }; - let res = tokio::select! { - // todo: add read|write .context to these errors - r = network_reader.run(msg_tx, msg_rx, reply_tx) => r, - r = network_write(pgb, reply_rx) => r, + + // Read first message and create timeline if needed. + let res = network_reader.read_first_message().await; + + let network_res = if let Ok((timeline, next_msg)) = res { + let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = + timeline + .get_walreceivers() + .pageserver_feedback_tx + .subscribe(); + *tli = Some(timeline.clone()); + + tokio::select! { + // todo: add read|write .context to these errors + r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r, + r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r, + } + } else { + res.map(|_| ()) }; // Join pg backend back. 
@@ -219,13 +289,13 @@ impl SafekeeperPostgresHandler { match acceptor_handle { None => { // failed even before spawning; read_network should have error - Err(res.expect_err("no error with WalAcceptor not spawn")) + Err(network_res.expect_err("no error with WalAcceptor not spawn")) } Some(handle) => { let wal_acceptor_res = handle.await; // If there was any network error, return it. - res?; + network_res?; // Otherwise, WalAcceptor thread must have errored. match wal_acceptor_res { @@ -251,12 +321,9 @@ struct NetworkReader<'a, IO> { } impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { - async fn run( - self, - msg_tx: Sender, - msg_rx: Receiver, - reply_tx: Sender, - ) -> Result<(), CopyStreamHandlerEnd> { + async fn read_first_message( + &mut self, + ) -> Result<(FullAccessTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. let next_msg = read_message(self.pgb_reader).await?; let tli = match next_msg { @@ -270,7 +337,10 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await? + let tli = + GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) + .await?; + tli.full_access_guard().await? 
} _ => { return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!( @@ -278,9 +348,19 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { ))) } }; + Ok((tli, next_msg)) + } + async fn run( + self, + msg_tx: Sender, + msg_rx: Receiver, + reply_tx: Sender, + tli: FullAccessTimeline, + next_msg: ProposerAcceptorMessage, + ) -> Result<(), CopyStreamHandlerEnd> { *self.acceptor_handle = Some(WalAcceptor::spawn( - tli.clone(), + tli, msg_rx, reply_tx, Some(self.conn_id), @@ -320,18 +400,46 @@ async fn read_network_loop( async fn network_write( pgb_writer: &mut PostgresBackend, mut reply_rx: Receiver, + mut pageserver_feedback_rx: tokio::sync::broadcast::Receiver, ) -> Result<(), CopyStreamHandlerEnd> { let mut buf = BytesMut::with_capacity(128); + // storing append_response to inject PageserverFeedback into it + let mut last_append_response = None; + loop { - match reply_rx.recv().await { - Some(msg) => { - buf.clear(); - msg.serialize(&mut buf)?; - pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; + // trying to read either AcceptorProposerMessage or PageserverFeedback + let msg = tokio::select! 
{ + reply = reply_rx.recv() => { + if let Some(msg) = reply { + if let AcceptorProposerMessage::AppendResponse(append_response) = &msg { + last_append_response = Some(append_response.clone()); + } + Some(msg) + } else { + return Ok(()); // chan closed, WalAcceptor terminated + } } - None => return Ok(()), // chan closed, WalAcceptor terminated - } + + feedback = pageserver_feedback_rx.recv() => + match (feedback, &last_append_response) { + (Ok(feedback), Some(append_response)) => { + // clone AppendResponse and inject PageserverFeedback into it + let mut append_response = append_response.clone(); + append_response.pageserver_feedback = Some(feedback); + Some(AcceptorProposerMessage::AppendResponse(append_response)) + } + _ => None, + } + }; + + let Some(msg) = msg else { + continue; + }; + + buf.clear(); + msg.serialize(&mut buf)?; + pgb_writer.write_message(&BeMessage::CopyData(&buf)).await?; } } @@ -343,7 +451,7 @@ const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); /// replies to reply_tx; reading from socket and writing to disk in parallel is /// beneficial for performance, this struct provides writing to disk part. pub struct WalAcceptor { - tli: Arc, + tli: FullAccessTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, @@ -356,7 +464,7 @@ impl WalAcceptor { /// /// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper. pub fn spawn( - tli: Arc, + tli: FullAccessTimeline, msg_rx: Receiver, reply_tx: Sender, conn_id: Option, @@ -381,14 +489,7 @@ impl WalAcceptor { /// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed; /// it must mean that network thread terminated. async fn run(&mut self) -> anyhow::Result<()> { - // Register the connection and defer unregister. - // Order of the next two lines is important: we want first to remove our entry and then - // update status which depends on registered connections. 
- let _compute_conn_guard = ComputeConnectionGuard { - timeline: Arc::clone(&self.tli), - }; let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); - self.tli.update_status_notify().await?; // After this timestamp we will stop processing AppendRequests and send a response // to the walproposer. walproposer sends at least one AppendRequest per second, @@ -454,19 +555,3 @@ impl WalAcceptor { } } } - -/// Calls update_status_notify in drop to update timeline status. -struct ComputeConnectionGuard { - timeline: Arc, -} - -impl Drop for ComputeConnectionGuard { - fn drop(&mut self) { - let tli = self.timeline.clone(); - tokio::spawn(async move { - if let Err(e) = tli.update_status_notify().await { - error!("failed to update timeline status: {}", e); - } - }); - } -} diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index e8fa6c55f4..80a630b1e1 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -2,7 +2,7 @@ //! provide it, i.e. safekeeper lags too much. use std::time::SystemTime; -use std::{fmt, pin::pin, sync::Arc}; +use std::{fmt, pin::pin}; use anyhow::{bail, Context}; use futures::StreamExt; @@ -21,6 +21,7 @@ use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config} use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE}; use crate::safekeeper::{AppendRequest, AppendRequestHeader}; +use crate::timeline::FullAccessTimeline; use crate::{ http::routes::TimelineStatus, receive_wal::MSG_QUEUE_SIZE, @@ -28,31 +29,106 @@ use crate::{ AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory, TermLsn, VoteRequest, }, - timeline::{PeerInfo, Timeline}, + timeline::PeerInfo, SafeKeeperConf, }; /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. 
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] -pub async fn recovery_main(tli: Arc, conf: SafeKeeperConf) { +pub async fn recovery_main(tli: FullAccessTimeline, conf: SafeKeeperConf) { info!("started"); - let mut cancellation_rx = match tli.get_cancellation_rx() { - Ok(rx) => rx, - Err(_) => { - info!("timeline canceled during task start"); - return; - } - }; + let cancel = tli.cancel.clone(); select! { _ = recovery_main_loop(tli, conf) => { unreachable!() } - _ = cancellation_rx.changed() => { + _ = cancel.cancelled() => { info!("stopped"); } } } +/// Should we start fetching WAL from a peer safekeeper, and if yes, from +/// which? Answer is yes, i.e. .donors is not empty if 1) there is something +/// to fetch, and we can do that without running elections; 2) there is no +/// actively streaming compute, as we don't want to compete with it. +/// +/// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal +/// to its last_log_term so we are sure such a leader ever had been elected. +/// +/// All possible donors are returned so that we could keep connection to the +/// current one if it is good even if it slightly lags behind. +/// +/// Note that term conditions above might be not met, but safekeepers are +/// still not aligned on last flush_lsn. Generally in this case until +/// elections are run it is not possible to say which safekeeper should +/// recover from which one -- history which would be committed is different +/// depending on assembled quorum (e.g. classic picture 8 from Raft paper). +/// Thus we don't try to predict it here. 
+async fn recovery_needed( + tli: &FullAccessTimeline, + heartbeat_timeout: Duration, +) -> RecoveryNeededInfo { + let ss = tli.read_shared_state().await; + let term = ss.sk.state.acceptor_state.term; + let last_log_term = ss.sk.get_last_log_term(); + let flush_lsn = ss.sk.flush_lsn(); + // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. + let mut peers = ss.get_peers(heartbeat_timeout); + // Sort by pairs. + peers.sort_by(|p1, p2| { + let tl1 = TermLsn { + term: p1.last_log_term, + lsn: p1.flush_lsn, + }; + let tl2 = TermLsn { + term: p2.last_log_term, + lsn: p2.flush_lsn, + }; + tl2.cmp(&tl1) // desc + }); + let num_streaming_computes = tli.get_walreceivers().get_num_streaming(); + let donors = if num_streaming_computes > 0 { + vec![] // If there is a streaming compute, don't try to recover to not intervene. + } else { + peers + .iter() + .filter_map(|candidate| { + // Are we interested in this candidate? + let candidate_tl = TermLsn { + term: candidate.last_log_term, + lsn: candidate.flush_lsn, + }; + let my_tl = TermLsn { + term: last_log_term, + lsn: flush_lsn, + }; + if my_tl < candidate_tl { + // Yes, we are interested. Can we pull from it without + // (re)running elections? It is possible if 1) his term + // is equal to his last_log_term so we could act on + // behalf of leader of this term (we must be sure he was + // ever elected) and 2) our term is not higher, or we'll refuse data. + if candidate.term == candidate.last_log_term && candidate.term >= term { + Some(Donor::from(candidate)) + } else { + None + } + } else { + None + } + }) + .collect() + }; + RecoveryNeededInfo { + term, + last_log_term, + flush_lsn, + peers, + num_streaming_computes, + donors, + } +} /// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and /// fields to explain the choice. 
#[derive(Debug)] @@ -119,10 +195,10 @@ impl From<&PeerInfo> for Donor { const CHECK_INTERVAL_MS: u64 = 2000; /// Check regularly whether we need to start recovery. -async fn recovery_main_loop(tli: Arc, conf: SafeKeeperConf) { +async fn recovery_main_loop(tli: FullAccessTimeline, conf: SafeKeeperConf) { let check_duration = Duration::from_millis(CHECK_INTERVAL_MS); loop { - let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await; + let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; match recovery_needed_info.donors.first() { Some(donor) => { info!( @@ -152,7 +228,7 @@ async fn recovery_main_loop(tli: Arc, conf: SafeKeeperConf) { /// Recover from the specified donor. Returns message explaining normal finish /// reason or error. async fn recover( - tli: Arc, + tli: FullAccessTimeline, donor: &Donor, conf: &SafeKeeperConf, ) -> anyhow::Result { @@ -238,7 +314,7 @@ async fn recover( // Pull WAL from donor, assuming handshake is already done. async fn recovery_stream( - tli: Arc, + tli: FullAccessTimeline, donor: &Donor, start_streaming_at: Lsn, conf: &SafeKeeperConf, @@ -322,7 +398,7 @@ async fn network_io( physical_stream: ReplicationStream, msg_tx: Sender, donor: Donor, - tli: Arc, + tli: FullAccessTimeline, conf: SafeKeeperConf, ) -> anyhow::Result> { let mut physical_stream = pin!(physical_stream); @@ -343,7 +419,7 @@ async fn network_io( ReplicationMessage::XLogData(xlog_data) => { let ar_hdr = AppendRequestHeader { term: donor.term, - epoch_start_lsn: Lsn::INVALID, // unused + term_start_lsn: Lsn::INVALID, // unused begin_lsn: Lsn(xlog_data.wal_start()), end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64, commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it @@ -371,7 +447,7 @@ async fn network_io( } ReplicationMessage::PrimaryKeepAlive(_) => { // keepalive means nothing is being streamed for a while. Check whether we need to stop. 
- let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await; + let recovery_needed_info = recovery_needed(&tli, conf.heartbeat_timeout).await; // do current donors still contain one we currently connected to? if !recovery_needed_info .donors diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index d96eedf401..b661e48cb5 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -1,32 +1,25 @@ -//! Thread removing old WAL. +use utils::lsn::Lsn; -use std::time::Duration; +use crate::timeline_manager::StateSnapshot; -use tokio::time::sleep; -use tracing::*; +/// Get oldest LSN we still need to keep. We hold WAL till it is consumed +/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 +/// offloading. +/// While it is safe to use inmem values for determining horizon, +/// we use persistent to make possible normal states less surprising. +/// All segments covering LSNs before horizon_lsn can be removed. +pub fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { + use std::cmp::min; -use crate::{GlobalTimelines, SafeKeeperConf}; - -pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> { - let wal_removal_interval = Duration::from_millis(5000); - loop { - let tlis = GlobalTimelines::get_all(); - for tli in &tlis { - if !tli.is_active().await { - continue; - } - let ttid = tli.ttid; - async { - if let Err(e) = tli.maybe_persist_control_file().await { - warn!("failed to persist control file: {e}"); - } - if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled).await { - error!("failed to remove WAL: {}", e); - } - } - .instrument(info_span!("WAL removal", ttid = %ttid)) - .await; - } - sleep(wal_removal_interval).await; + let mut horizon_lsn = min( + state.cfile_remote_consistent_lsn, + state.cfile_peer_horizon_lsn, + ); + // we don't want to remove WAL that is not yet offloaded to s3 + horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn); + if let 
Some(extra_horizon_lsn) = extra_horizon_lsn { + horizon_lsn = min(horizon_lsn, extra_horizon_lsn); } + + horizon_lsn } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 217a5f89ee..ae230960ae 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -4,13 +4,12 @@ use anyhow::{bail, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; -use postgres_ffi::{TimeLineID, XLogSegNo, MAX_SEND_SIZE}; +use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; use std::fmt; use std::io::Read; -use std::time::Duration; use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; @@ -18,17 +17,16 @@ use tracing::*; use crate::control_file; use crate::send_wal::HotStandbyFeedback; +use crate::state::TimelineState; use crate::wal_storage; use pq_proto::SystemId; use utils::pageserver_feedback::PageserverFeedback; use utils::{ bin_ser::LeSer, - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{NodeId, TenantId, TimelineId}, lsn::Lsn, }; -pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 7; const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; @@ -189,8 +187,8 @@ pub struct AcceptorState { } impl AcceptorState { - /// acceptor's epoch is the term of the highest entry in the log - pub fn get_epoch(&self, flush_lsn: Lsn) -> Term { + /// acceptor's last_log_term is the term of the highest entry in the log + pub fn get_last_log_term(&self, flush_lsn: Lsn) -> Term { let th = self.term_history.up_to(flush_lsn); match th.0.last() { Some(e) => e.term, @@ -222,7 +220,7 @@ pub struct PersistedPeerInfo { } impl PersistedPeerInfo { - fn new() -> Self { + pub fn new() -> Self { Self { backup_lsn: Lsn::INVALID, term: INVALID_TERM, @@ -232,111 +230,10 @@ impl PersistedPeerInfo { } } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct 
PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); - -/// Persistent information stored on safekeeper node -/// On disk data is prefixed by magic and format version and followed by checksum. -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct SafeKeeperState { - #[serde(with = "hex")] - pub tenant_id: TenantId, - #[serde(with = "hex")] - pub timeline_id: TimelineId, - /// persistent acceptor state - pub acceptor_state: AcceptorState, - /// information about server - pub server: ServerInfo, - /// Unique id of the last *elected* proposer we dealt with. Not needed - /// for correctness, exists for monitoring purposes. - #[serde(with = "hex")] - pub proposer_uuid: PgUuid, - /// Since which LSN this timeline generally starts. Safekeeper might have - /// joined later. - pub timeline_start_lsn: Lsn, - /// Since which LSN safekeeper has (had) WAL for this timeline. - /// All WAL segments next to one containing local_start_lsn are - /// filled with data from the beginning. - pub local_start_lsn: Lsn, - /// Part of WAL acknowledged by quorum *and available locally*. Always points - /// to record boundary. - pub commit_lsn: Lsn, - /// LSN that points to the end of the last backed up segment. Useful to - /// persist to avoid finding out offloading progress on boot. - pub backup_lsn: Lsn, - /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn - /// of last record streamed to everyone). Persisting it helps skipping - /// recovery in walproposer, generally we compute it from peers. In - /// walproposer proto called 'truncate_lsn'. Updates are currently drived - /// only by walproposer. - pub peer_horizon_lsn: Lsn, - /// LSN of the oldest known checkpoint made by pageserver and successfully - /// pushed to s3. We don't remove WAL beyond it. Persisted only for - /// informational purposes, we receive it from pageserver (or broker). - pub remote_consistent_lsn: Lsn, - // Peers and their state as we remember it. 
Knowing peers themselves is - // fundamental; but state is saved here only for informational purposes and - // obviously can be stale. (Currently not saved at all, but let's provision - // place to have less file version upgrades). - pub peers: PersistedPeers, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -// In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; values -// are not flushed yet. -pub struct SafekeeperMemState { - pub commit_lsn: Lsn, - pub backup_lsn: Lsn, - pub peer_horizon_lsn: Lsn, - #[serde(with = "hex")] - pub proposer_uuid: PgUuid, -} - -impl SafeKeeperState { - pub fn new( - ttid: &TenantTimelineId, - server_info: ServerInfo, - peers: Vec, - commit_lsn: Lsn, - local_start_lsn: Lsn, - ) -> SafeKeeperState { - SafeKeeperState { - tenant_id: ttid.tenant_id, - timeline_id: ttid.timeline_id, - acceptor_state: AcceptorState { - term: 0, - term_history: TermHistory::empty(), - }, - server: server_info, - proposer_uuid: [0; 16], - timeline_start_lsn: Lsn(0), - local_start_lsn, - commit_lsn, - backup_lsn: local_start_lsn, - peer_horizon_lsn: local_start_lsn, - remote_consistent_lsn: Lsn(0), - peers: PersistedPeers( - peers - .iter() - .map(|p| (*p, PersistedPeerInfo::new())) - .collect(), - ), - } - } - - #[cfg(test)] - pub fn empty() -> Self { - SafeKeeperState::new( - &TenantTimelineId::empty(), - ServerInfo { - pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ - system_id: 0, /* Postgres system identifier */ - wal_seg_size: 0, - }, - vec![], - Lsn::INVALID, - Lsn::INVALID, - ) +// make clippy happy +impl Default for PersistedPeerInfo { + fn default() -> Self { + Self::new() } } @@ -407,9 +304,9 @@ pub struct AppendRequest { pub struct AppendRequestHeader { // safekeeper's current term; if it is higher than proposer's, the compute is out of date. pub term: Term, - // TODO: remove this field, it in unused -- LSN of term switch can be taken - // from ProposerElected (as well as from term history). 
- pub epoch_start_lsn: Lsn, + // TODO: remove this field from the protocol, it in unused -- LSN of term + // switch can be taken from ProposerElected (as well as from term history). + pub term_start_lsn: Lsn, /// start position of message in WAL pub begin_lsn: Lsn, /// end position of message in WAL @@ -423,20 +320,21 @@ pub struct AppendRequestHeader { } /// Report safekeeper state to proposer -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, Clone)] pub struct AppendResponse { // Current term of the safekeeper; if it is higher than proposer's, the // compute is out of date. pub term: Term, - // NOTE: this is physical end of wal on safekeeper; currently it doesn't - // make much sense without taking epoch into account, as history can be - // diverged. + // Flushed end of wal on safekeeper; one should be always mindful from what + // term history this value comes, either checking history directly or + // observing term being set to one for which WAL truncation is known to have + // happened. pub flush_lsn: Lsn, // We report back our awareness about which WAL is committed, as this is // a criterion for walproposer --sync mode exit pub commit_lsn: Lsn, pub hs_feedback: HotStandbyFeedback, - pub pageserver_feedback: PageserverFeedback, + pub pageserver_feedback: Option, } impl AppendResponse { @@ -446,7 +344,7 @@ impl AppendResponse { flush_lsn: Lsn(0), commit_lsn: Lsn(0), hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: PageserverFeedback::empty(), + pageserver_feedback: None, } } } @@ -564,7 +462,11 @@ impl AcceptorProposerMessage { buf.put_u64_le(msg.hs_feedback.xmin); buf.put_u64_le(msg.hs_feedback.catalog_xmin); - msg.pageserver_feedback.serialize(buf); + // AsyncReadMessage in walproposer.c will not try to decode pageserver_feedback + // if it is not present. 
+ if let Some(ref msg) = msg.pageserver_feedback { + msg.serialize(buf); + } } } @@ -580,12 +482,10 @@ impl AcceptorProposerMessage { /// - messages from broker peers pub struct SafeKeeper { /// LSN since the proposer safekeeper currently talking to appends WAL; - /// determines epoch switch point. - pub epoch_start_lsn: Lsn, - - pub inmem: SafekeeperMemState, // in memory part - pub state: CTRL, // persistent state storage + /// determines last_log_term switch point. + pub term_start_lsn: Lsn, + pub state: TimelineState, // persistent state storage pub wal_store: WAL, node_id: NodeId, // safekeeper's node id @@ -611,14 +511,8 @@ where } Ok(SafeKeeper { - epoch_start_lsn: Lsn(0), - inmem: SafekeeperMemState { - commit_lsn: state.commit_lsn, - backup_lsn: state.backup_lsn, - peer_horizon_lsn: state.peer_horizon_lsn, - proposer_uuid: state.proposer_uuid, - }, - state, + term_start_lsn: Lsn(0), + state: TimelineState::new(state), wal_store, node_id, }) @@ -637,8 +531,10 @@ where self.state.acceptor_state.term } - pub fn get_epoch(&self) -> Term { - self.state.acceptor_state.get_epoch(self.flush_lsn()) + pub fn get_last_log_term(&self) -> Term { + self.state + .acceptor_state + .get_last_log_term(self.flush_lsn()) } /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet. 
@@ -726,12 +622,12 @@ where ); } - let mut state = self.state.clone(); + let mut state = self.state.start_change(); state.server.system_id = msg.system_id; if msg.pg_version != UNKNOWN_SERVER_VERSION { state.server.pg_version = msg.pg_version; } - self.state.persist(&state).await?; + self.state.finish_change(&state).await?; } info!( @@ -766,15 +662,15 @@ where term: self.state.acceptor_state.term, vote_given: false as u64, flush_lsn: self.flush_lsn(), - truncate_lsn: self.inmem.peer_horizon_lsn, + truncate_lsn: self.state.inmem.peer_horizon_lsn, term_history: self.get_term_history(), timeline_start_lsn: self.state.timeline_start_lsn, }; if self.state.acceptor_state.term < msg.term { - let mut state = self.state.clone(); + let mut state = self.state.start_change(); state.acceptor_state.term = msg.term; // persist vote before sending it out - self.state.persist(&state).await?; + self.state.finish_change(&state).await?; resp.term = self.state.acceptor_state.term; resp.vote_given = true as u64; @@ -791,7 +687,7 @@ where commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: PageserverFeedback::empty(), + pageserver_feedback: None, }; trace!("formed AppendResponse {:?}", ar); ar @@ -803,9 +699,9 @@ where ) -> Result> { info!("received ProposerElected {:?}", msg); if self.state.acceptor_state.term < msg.term { - let mut state = self.state.clone(); + let mut state = self.state.start_change(); state.acceptor_state.term = msg.term; - self.state.persist(&state).await?; + self.state.finish_change(&state).await?; } // If our term is higher, ignore the message (next feedback will inform the compute) @@ -819,18 +715,30 @@ where // proceed, but to prevent commit_lsn surprisingly going down we should // either refuse the session (simpler) or skip the part we already have // from the stream (can be implemented). 
- if msg.term == self.get_epoch() && self.flush_lsn() > msg.start_streaming_at { + if msg.term == self.get_last_log_term() && self.flush_lsn() > msg.start_streaming_at { bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help", msg.term, self.flush_lsn(), msg.start_streaming_at) } // Otherwise we must never attempt to truncate committed data. assert!( - msg.start_streaming_at >= self.inmem.commit_lsn, + msg.start_streaming_at >= self.state.inmem.commit_lsn, "attempt to truncate committed data: start_streaming_at={}, commit_lsn={}", msg.start_streaming_at, - self.inmem.commit_lsn + self.state.inmem.commit_lsn ); + // Before first WAL write initialize its segment. It makes first segment + // pg_waldump'able because stream from compute doesn't include its + // segment and page headers. + // + // If we fail before first WAL write flush this action would be + // repeated, that's ok because it is idempotent. + if self.wal_store.flush_lsn() == Lsn::INVALID { + self.wal_store + .initialize_first_segment(msg.start_streaming_at) + .await?; + } + // TODO: cross check divergence point, check if msg.start_streaming_at corresponds to // intersection of our history and history from msg @@ -839,7 +747,7 @@ where // and now adopt term history from proposer { - let mut state = self.state.clone(); + let mut state = self.state.start_change(); // Here we learn initial LSN for the first time, set fields // interested in that. @@ -852,6 +760,11 @@ where state.timeline_start_lsn ); } + if state.peer_horizon_lsn == Lsn(0) { + // Update peer_horizon_lsn as soon as we know where timeline starts. + // It means that peer_horizon_lsn cannot be zero after we know timeline_start_lsn. 
+ state.peer_horizon_lsn = msg.timeline_start_lsn; + } if state.local_start_lsn == Lsn(0) { state.local_start_lsn = msg.start_streaming_at; info!("setting local_start_lsn to {:?}", state.local_start_lsn); @@ -863,13 +776,16 @@ where // NB: on new clusters, this happens at the same time as // timeline_start_lsn initialization, it is taken outside to provide // upgrade. - self.inmem.commit_lsn = max(self.inmem.commit_lsn, state.timeline_start_lsn); + state.commit_lsn = max(state.commit_lsn, state.timeline_start_lsn); // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment. - self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn); + state.backup_lsn = max(state.backup_lsn, state.timeline_start_lsn); + // similar for remote_consistent_lsn + state.remote_consistent_lsn = + max(state.remote_consistent_lsn, state.timeline_start_lsn); state.acceptor_state.term_history = msg.term_history.clone(); - self.persist_control_file(state).await?; + self.state.finish_change(&state).await?; } info!("start receiving WAL since {:?}", msg.start_streaming_at); @@ -877,7 +793,7 @@ where // Cache LSN where term starts to immediately fsync control file with // commit_lsn once we reach it -- sync-safekeepers finishes when // persisted commit_lsn on majority of safekeepers aligns. - self.epoch_start_lsn = match msg.term_history.0.last() { + self.term_start_lsn = match msg.term_history.0.last() { None => bail!("proposer elected with empty term history"), Some(term_lsn_start) => term_lsn_start.lsn, }; @@ -892,68 +808,28 @@ where async fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> { // Both peers and walproposer communicate this value, we might already // have a fresher (higher) version. 
- candidate = max(candidate, self.inmem.commit_lsn); + candidate = max(candidate, self.state.inmem.commit_lsn); let commit_lsn = min(candidate, self.flush_lsn()); assert!( - commit_lsn >= self.inmem.commit_lsn, + commit_lsn >= self.state.inmem.commit_lsn, "commit_lsn monotonicity violated: old={} new={}", - self.inmem.commit_lsn, + self.state.inmem.commit_lsn, commit_lsn ); - self.inmem.commit_lsn = commit_lsn; + self.state.inmem.commit_lsn = commit_lsn; - // If new commit_lsn reached epoch switch, force sync of control + // If new commit_lsn reached term switch, force sync of control // file: walproposer in sync mode is very interested when this // happens. Note: this is for sync-safekeepers mode only, as - // otherwise commit_lsn might jump over epoch_start_lsn. - if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn { - self.persist_control_file(self.state.clone()).await?; + // otherwise commit_lsn might jump over term_start_lsn. + if commit_lsn >= self.term_start_lsn && self.state.commit_lsn < self.term_start_lsn { + self.state.flush().await?; } Ok(()) } - /// Persist in-memory state of control file to disk. - // - // TODO: passing inmem_remote_consistent_lsn everywhere is ugly, better - // separate state completely and give Arc to all those who need it. - pub async fn persist_inmem(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> { - let mut state = self.state.clone(); - state.remote_consistent_lsn = inmem_remote_consistent_lsn; - self.persist_control_file(state).await - } - - /// Persist in-memory state to the disk, taking other data from state. 
- async fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> { - state.commit_lsn = self.inmem.commit_lsn; - state.backup_lsn = self.inmem.backup_lsn; - state.peer_horizon_lsn = self.inmem.peer_horizon_lsn; - state.proposer_uuid = self.inmem.proposer_uuid; - self.state.persist(&state).await - } - - /// Persist control file if there is something to save and enough time - /// passed after the last save. - pub async fn maybe_persist_inmem_control_file( - &mut self, - inmem_remote_consistent_lsn: Lsn, - ) -> Result<()> { - const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); - if self.state.last_persist_at().elapsed() < CF_SAVE_INTERVAL { - return Ok(()); - } - let need_persist = self.inmem.commit_lsn > self.state.commit_lsn - || self.inmem.backup_lsn > self.state.backup_lsn - || self.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn - || inmem_remote_consistent_lsn > self.state.remote_consistent_lsn; - if need_persist { - self.persist_inmem(inmem_remote_consistent_lsn).await?; - trace!("saved control file: {CF_SAVE_INTERVAL:?} passed"); - } - Ok(()) - } - /// Handle request to append WAL. #[allow(clippy::comparison_chain)] async fn handle_append_request( @@ -974,7 +850,7 @@ where // Now we know that we are in the same term as the proposer, // processing the message. - self.inmem.proposer_uuid = msg.h.proposer_uuid; + self.state.inmem.proposer_uuid = msg.h.proposer_uuid; // do the job if !msg.wal_data.is_empty() { @@ -998,15 +874,16 @@ where // - if we make safekeepers always send persistent value, // any compute restart would pull it down. // Thus, take max before adopting. - self.inmem.peer_horizon_lsn = max(self.inmem.peer_horizon_lsn, msg.h.truncate_lsn); + self.state.inmem.peer_horizon_lsn = + max(self.state.inmem.peer_horizon_lsn, msg.h.truncate_lsn); // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only - // when truncate_lsn delta exceeds WAL segment size. 
- if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) - < self.inmem.peer_horizon_lsn + // when commit_lsn delta exceeds WAL segment size. + if self.state.commit_lsn + (self.state.server.wal_seg_size as u64) + < self.state.inmem.commit_lsn { - self.persist_control_file(self.state.clone()).await?; + self.state.flush().await?; } trace!( @@ -1043,70 +920,57 @@ where // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. - if sk_info.last_log_term == self.get_epoch() { + if sk_info.last_log_term == self.get_last_log_term() { self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?; } } - let new_backup_lsn = max(Lsn(sk_info.backup_lsn), self.inmem.backup_lsn); - sync_control_file |= - self.state.backup_lsn + (self.state.server.wal_seg_size as u64) < new_backup_lsn; - self.inmem.backup_lsn = new_backup_lsn; + self.state.inmem.backup_lsn = max(Lsn(sk_info.backup_lsn), self.state.inmem.backup_lsn); + sync_control_file |= self.state.backup_lsn + (self.state.server.wal_seg_size as u64) + < self.state.inmem.backup_lsn; - // value in sk_info should be maximized over our local in memory value. 
- let new_remote_consistent_lsn = Lsn(sk_info.remote_consistent_lsn); - assert!(self.state.remote_consistent_lsn <= new_remote_consistent_lsn); + self.state.inmem.remote_consistent_lsn = max( + Lsn(sk_info.remote_consistent_lsn), + self.state.inmem.remote_consistent_lsn, + ); sync_control_file |= self.state.remote_consistent_lsn + (self.state.server.wal_seg_size as u64) - < new_remote_consistent_lsn; + < self.state.inmem.remote_consistent_lsn; - let new_peer_horizon_lsn = max(Lsn(sk_info.peer_horizon_lsn), self.inmem.peer_horizon_lsn); + self.state.inmem.peer_horizon_lsn = max( + Lsn(sk_info.peer_horizon_lsn), + self.state.inmem.peer_horizon_lsn, + ); sync_control_file |= self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64) - < new_peer_horizon_lsn; - self.inmem.peer_horizon_lsn = new_peer_horizon_lsn; + < self.state.inmem.peer_horizon_lsn; if sync_control_file { - let mut state = self.state.clone(); - state.remote_consistent_lsn = new_remote_consistent_lsn; - self.persist_control_file(state).await?; + self.state.flush().await?; } Ok(()) } - - /// Get oldest segno we still need to keep. We hold WAL till it is consumed - /// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 - /// offloading. - /// While it is safe to use inmem values for determining horizon, - /// we use persistent to make possible normal states less surprising. 
- pub fn get_horizon_segno(&self, wal_backup_enabled: bool) -> XLogSegNo { - let mut horizon_lsn = min( - self.state.remote_consistent_lsn, - self.state.peer_horizon_lsn, - ); - if wal_backup_enabled { - horizon_lsn = min(horizon_lsn, self.state.backup_lsn); - } - horizon_lsn.segment_number(self.state.server.wal_seg_size as usize) - } } #[cfg(test)] mod tests { use futures::future::BoxFuture; - use postgres_ffi::WAL_SEGMENT_SIZE; + use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use super::*; - use crate::wal_storage::Storage; + use crate::{ + state::{PersistedPeers, TimelinePersistentState}, + wal_storage::Storage, + }; use std::{ops::Deref, str::FromStr, time::Instant}; // fake storage for tests struct InMemoryState { - persisted_state: SafeKeeperState, + persisted_state: TimelinePersistentState, } #[async_trait::async_trait] impl control_file::Storage for InMemoryState { - async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> { + async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { self.persisted_state = s.clone(); Ok(()) } @@ -1117,15 +981,15 @@ mod tests { } impl Deref for InMemoryState { - type Target = SafeKeeperState; + type Target = TimelinePersistentState; fn deref(&self) -> &Self::Target { &self.persisted_state } } - fn test_sk_state() -> SafeKeeperState { - let mut state = SafeKeeperState::empty(); + fn test_sk_state() -> TimelinePersistentState { + let mut state = TimelinePersistentState::empty(); state.server.wal_seg_size = WAL_SEGMENT_SIZE as u32; state.tenant_id = TenantId::from([1u8; 16]); state.timeline_id = TimelineId::from([1u8; 16]); @@ -1142,6 +1006,10 @@ mod tests { self.lsn } + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { self.lsn = startpos + buf.len() as u64; Ok(()) @@ -1182,7 +1050,7 @@ mod tests { } // reboot... 
- let state = sk.state.persisted_state.clone(); + let state = sk.state.deref().clone(); let storage = InMemoryState { persisted_state: state, }; @@ -1198,7 +1066,7 @@ mod tests { } #[tokio::test] - async fn test_epoch_switch() { + async fn test_last_log_term_switch() { let storage = InMemoryState { persisted_state: test_sk_state(), }; @@ -1208,7 +1076,7 @@ mod tests { let mut ar_hdr = AppendRequestHeader { term: 1, - epoch_start_lsn: Lsn(3), + term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), commit_lsn: Lsn(0), @@ -1233,14 +1101,14 @@ mod tests { .await .unwrap(); - // check that AppendRequest before epochStartLsn doesn't switch epoch + // check that AppendRequest before term_start_lsn doesn't switch last_log_term. let resp = sk .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) .await; assert!(resp.is_ok()); - assert_eq!(sk.get_epoch(), 0); + assert_eq!(sk.get_last_log_term(), 0); - // but record at epochStartLsn does the switch + // but record at term_start_lsn does the switch ar_hdr.begin_lsn = Lsn(2); ar_hdr.end_lsn = Lsn(3); append_request = AppendRequest { @@ -1252,7 +1120,7 @@ mod tests { .await; assert!(resp.is_ok()); sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %) - assert_eq!(sk.get_epoch(), 1); + assert_eq!(sk.get_last_log_term(), 1); } #[test] @@ -1321,7 +1189,7 @@ mod tests { use utils::Hex; let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(); let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap(); - let state = SafeKeeperState { + let state = TimelinePersistentState { tenant_id, timeline_id, acceptor_state: AcceptorState { @@ -1356,6 +1224,7 @@ mod tests { commit_lsn: Lsn(1234567600), }, )]), + partial_backup: crate::wal_backup_partial::State::default(), }; let ser = state.ser().unwrap(); @@ -1401,11 +1270,13 @@ mod tests { 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0xb0, 
0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, + // partial_backup + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]; assert_eq!(Hex(&ser), Hex(&expected)); - let deser = SafeKeeperState::des(&ser).unwrap(); + let deser = TimelinePersistentState::des(&ser).unwrap(); assert_eq!(deser, state); } diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 9a5657a40d..df75893838 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,8 +2,10 @@ //! with the "START_REPLICATION" message, and registry of walsenders. use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::RECEIVED_PS_FEEDBACKS; +use crate::receive_wal::WalReceivers; use crate::safekeeper::{Term, TermLsn}; -use crate::timeline::Timeline; +use crate::timeline::FullAccessTimeline; use crate::wal_service::ConnectionId; use crate::wal_storage::WalReader; use crate::GlobalTimelines; @@ -19,7 +21,6 @@ use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; use utils::failpoint_support; use utils::id::TenantTimelineId; -use utils::lsn::AtomicLsn; use utils::pageserver_feedback::PageserverFeedback; use std::cmp::{max, min}; @@ -84,23 +85,30 @@ impl StandbyReply { #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct StandbyFeedback { - reply: StandbyReply, - hs_feedback: HotStandbyFeedback, + pub reply: StandbyReply, + pub hs_feedback: HotStandbyFeedback, +} + +impl StandbyFeedback { + pub fn empty() -> Self { + StandbyFeedback { + reply: StandbyReply::empty(), + hs_feedback: HotStandbyFeedback::empty(), + } + } } /// WalSenders registry. Timeline holds it (wrapped in Arc). pub struct WalSenders { - /// Lsn maximized over all walsenders *and* peer data, so might be higher - /// than what we receive from replicas. 
- remote_consistent_lsn: AtomicLsn, mutex: Mutex, + walreceivers: Arc, } impl WalSenders { - pub fn new(remote_consistent_lsn: Lsn) -> Arc { + pub fn new(walreceivers: Arc) -> Arc { Arc::new(WalSenders { - remote_consistent_lsn: AtomicLsn::from(remote_consistent_lsn), mutex: Mutex::new(WalSendersShared::new()), + walreceivers, }) } @@ -141,29 +149,54 @@ impl WalSenders { self.mutex.lock().slots.iter().flatten().cloned().collect() } - /// Get aggregated pageserver feedback. - pub fn get_ps_feedback(self: &Arc) -> PageserverFeedback { - self.mutex.lock().agg_ps_feedback + /// Get LSN of the most lagging pageserver receiver. Return None if there are no + /// active walsenders. + pub fn laggard_lsn(self: &Arc) -> Option { + self.mutex + .lock() + .slots + .iter() + .flatten() + .filter_map(|s| match s.feedback { + ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn), + ReplicationFeedback::Standby(_) => None, + }) + .min() } - /// Get aggregated pageserver and hot standby feedback (we send them to compute). - pub fn get_feedbacks(self: &Arc) -> (PageserverFeedback, HotStandbyFeedback) { + /// Returns total counter of pageserver feedbacks received and last feedback. + pub fn get_ps_feedback_stats(self: &Arc) -> (u64, PageserverFeedback) { let shared = self.mutex.lock(); - (shared.agg_ps_feedback, shared.agg_hs_feedback) + (shared.ps_feedback_counter, shared.last_ps_feedback) + } + + /// Get aggregated hot standby feedback (we send it to compute). + pub fn get_hotstandby(self: &Arc) -> StandbyFeedback { + self.mutex.lock().agg_standby_feedback } /// Record new pageserver feedback, update aggregated values. 
fn record_ps_feedback(self: &Arc, id: WalSenderId, feedback: &PageserverFeedback) { let mut shared = self.mutex.lock(); shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); - shared.update_ps_feedback(); - self.update_remote_consistent_lsn(shared.agg_ps_feedback.remote_consistent_lsn); + shared.last_ps_feedback = *feedback; + shared.ps_feedback_counter += 1; + drop(shared); + + RECEIVED_PS_FEEDBACKS.inc(); + + // send feedback to connected walproposers + self.walreceivers.broadcast_pageserver_feedback(*feedback); } /// Record standby reply. fn record_standby_reply(self: &Arc, id: WalSenderId, reply: &StandbyReply) { let mut shared = self.mutex.lock(); let slot = shared.get_slot_mut(id); + debug!( + "Record standby reply: ts={} apply_lsn={}", + reply.reply_ts, reply.apply_lsn + ); match &mut slot.feedback { ReplicationFeedback::Standby(sf) => sf.reply = *reply, ReplicationFeedback::Pageserver(_) => { @@ -188,7 +221,7 @@ impl WalSenders { }) } } - shared.update_hs_feedback(); + shared.update_reply_feedback(); } /// Get remote_consistent_lsn reported by the pageserver. Returns None if @@ -202,39 +235,30 @@ impl WalSenders { } } - /// Get remote_consistent_lsn maximized across all walsenders and peers. - pub fn get_remote_consistent_lsn(self: &Arc) -> Lsn { - self.remote_consistent_lsn.load() - } - - /// Update maximized remote_consistent_lsn, return new (potentially) value. - pub fn update_remote_consistent_lsn(self: &Arc, candidate: Lsn) -> Lsn { - self.remote_consistent_lsn - .fetch_max(candidate) - .max(candidate) - } - /// Unregister walsender. 
fn unregister(self: &Arc, id: WalSenderId) { let mut shared = self.mutex.lock(); shared.slots[id] = None; - shared.update_hs_feedback(); + shared.update_reply_feedback(); } } struct WalSendersShared { // aggregated over all walsenders value - agg_hs_feedback: HotStandbyFeedback, - // aggregated over all walsenders value - agg_ps_feedback: PageserverFeedback, + agg_standby_feedback: StandbyFeedback, + // last feedback ever received from any pageserver, empty if none + last_ps_feedback: PageserverFeedback, + // total counter of pageserver feedbacks received + ps_feedback_counter: u64, slots: Vec>, } impl WalSendersShared { fn new() -> Self { WalSendersShared { - agg_hs_feedback: HotStandbyFeedback::empty(), - agg_ps_feedback: PageserverFeedback::empty(), + agg_standby_feedback: StandbyFeedback::empty(), + last_ps_feedback: PageserverFeedback::empty(), + ps_feedback_counter: 0, slots: Vec::new(), } } @@ -249,10 +273,11 @@ impl WalSendersShared { self.slots[id].as_mut().expect("walsender doesn't exist") } - /// Update aggregated hot standy feedback. We just take min of valid xmins + /// Update aggregated hot standby and normal reply feedbacks. We just take min of valid xmins /// and ts.
- fn update_hs_feedback(&mut self) { + fn update_reply_feedback(&mut self) { let mut agg = HotStandbyFeedback::empty(); + let mut reply_agg = StandbyReply::empty(); for ws_state in self.slots.iter().flatten() { if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { let hs_feedback = standby_feedback.hs_feedback; @@ -265,7 +290,7 @@ impl WalSendersShared { } else { agg.xmin = hs_feedback.xmin; } - agg.ts = min(agg.ts, hs_feedback.ts); + agg.ts = max(agg.ts, hs_feedback.ts); } if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID { if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID { @@ -273,42 +298,43 @@ impl WalSendersShared { } else { agg.catalog_xmin = hs_feedback.catalog_xmin; } - agg.ts = min(agg.ts, hs_feedback.ts); + agg.ts = max(agg.ts, hs_feedback.ts); + } + let reply = standby_feedback.reply; + if reply.write_lsn != Lsn::INVALID { + if reply_agg.write_lsn != Lsn::INVALID { + reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn); + } else { + reply_agg.write_lsn = reply.write_lsn; + } + } + if reply.flush_lsn != Lsn::INVALID { + if reply_agg.flush_lsn != Lsn::INVALID { + reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn); + } else { + reply_agg.flush_lsn = reply.flush_lsn; + } + } + if reply.apply_lsn != Lsn::INVALID { + if reply_agg.apply_lsn != Lsn::INVALID { + reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn); + } else { + reply_agg.apply_lsn = reply.apply_lsn; + } + } + if reply.reply_ts != 0 { + if reply_agg.reply_ts != 0 { + reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts); + } else { + reply_agg.reply_ts = reply.reply_ts; + } } } } - self.agg_hs_feedback = agg; - } - - /// Update aggregated pageserver feedback. LSNs (last_received, - /// disk_consistent, remote_consistent) and reply timestamp are just - /// maximized; timeline_size if taken from feedback with highest - /// last_received lsn. 
This is generally reasonable, but we might want to - /// implement other policies once multiple pageservers start to be actively - /// used. - fn update_ps_feedback(&mut self) { - let init = PageserverFeedback::empty(); - let acc = - self.slots - .iter() - .flatten() - .fold(init, |mut acc, ws_state| match ws_state.feedback { - ReplicationFeedback::Pageserver(feedback) => { - if feedback.last_received_lsn > acc.last_received_lsn { - acc.current_timeline_size = feedback.current_timeline_size; - } - acc.last_received_lsn = - max(feedback.last_received_lsn, acc.last_received_lsn); - acc.disk_consistent_lsn = - max(feedback.disk_consistent_lsn, acc.disk_consistent_lsn); - acc.remote_consistent_lsn = - max(feedback.remote_consistent_lsn, acc.remote_consistent_lsn); - acc.replytime = max(feedback.replytime, acc.replytime); - acc - } - ReplicationFeedback::Standby(_) => acc, - }); - self.agg_ps_feedback = acc; + self.agg_standby_feedback = StandbyFeedback { + reply: reply_agg, + hs_feedback: agg, + }; } } @@ -360,12 +386,18 @@ impl SafekeeperPostgresHandler { start_pos: Lsn, term: Option, ) -> Result<(), QueryError> { + let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?; + let full_access = tli.full_access_guard().await?; + if let Err(end) = self - .handle_start_replication_guts(pgb, start_pos, term) + .handle_start_replication_guts(pgb, start_pos, term, full_access) .await { + let info = tli.get_safekeeper_info(&self.conf).await; // Log the result and probably send it to the client, closing the stream. 
- pgb.handle_copy_stream_end(end).await; + pgb.handle_copy_stream_end(end) + .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn))) + .await; } Ok(()) } @@ -375,10 +407,9 @@ impl SafekeeperPostgresHandler { pgb: &mut PostgresBackend, start_pos: Lsn, term: Option, + tli: FullAccessTimeline, ) -> Result<(), CopyStreamHandlerEnd> { let appname = self.appname.clone(); - let tli = - GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?; // Use a guard object to remove our entry from the timeline when we are done. let ws_guard = Arc::new(tli.get_walsenders().register( @@ -419,14 +450,7 @@ impl SafekeeperPostgresHandler { // switch to copy pgb.write_message(&BeMessage::CopyBothResponse).await?; - let (_, persisted_state) = tli.get_state().await; - let wal_reader = WalReader::new( - self.conf.workdir.clone(), - self.conf.timeline_dir(&tli.ttid), - &persisted_state, - start_pos, - self.conf.wal_backup_enabled, - )?; + let wal_reader = tli.get_walreader(start_pos).await?; // Split to concurrently receive and send data; replies are generally // not synchronized with sends, so this avoids deadlocks. @@ -444,13 +468,29 @@ impl SafekeeperPostgresHandler { wal_reader, send_buf: [0; MAX_SEND_SIZE], }; - let mut reply_reader = ReplyReader { reader, ws_guard }; + let mut reply_reader = ReplyReader { + reader, + ws_guard: ws_guard.clone(), + tli, + }; let res = tokio::select! { // todo: add read|write .context to these errors r = sender.run() => r, r = reply_reader.run() => r, }; + + let ws_state = ws_guard + .walsenders + .mutex + .lock() + .get_slot(ws_guard.id) + .clone(); + info!( + "finished streaming to {}, feedback={:?}", + ws_state.addr, ws_state.feedback, + ); + // Join pg backend back. pgb.unsplit(reply_reader.reader)?; @@ -487,7 +527,7 @@ impl EndWatch { /// A half driving sending WAL.
struct WalSender<'a, IO> { pgb: &'a mut PostgresBackend, - tli: Arc, + tli: FullAccessTimeline, appname: Option, // Position since which we are sending next chunk. start_pos: Lsn, @@ -510,6 +550,8 @@ struct WalSender<'a, IO> { send_buf: [u8; MAX_SEND_SIZE], } +const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); + impl WalSender<'_, IO> { /// Send WAL until /// - an error occurs @@ -588,14 +630,22 @@ impl WalSender<'_, IO> { async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> { loop { self.end_pos = self.end_watch.get(); - if self.end_pos > self.start_pos { - // We have something to send. + let have_something_to_send = (|| { + fail::fail_point!( + "sk-pause-send", + self.appname.as_deref() != Some("pageserver"), + |_| { false } + ); + self.end_pos > self.start_pos + })(); + + if have_something_to_send { trace!("got end_pos {:?}, streaming", self.end_pos); return Ok(()); } // Wait for WAL to appear, now self.end_pos == self.start_pos. - if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? { + if let Some(lsn) = self.wait_for_lsn().await? { self.end_pos = lsn; trace!("got end_pos {:?}, streaming", self.end_pos); return Ok(()); @@ -632,28 +682,84 @@ impl WalSender<'_, IO> { .await?; } } + + /// Wait until we have available WAL > start_pos or timeout expires. Returns + /// - Ok(Some(end_pos)) if needed lsn is successfully observed; + /// - Ok(None) if timeout expired; + /// - Err in case of error -- only if 1) term changed while fetching in recovery + /// mode 2) watch channel closed, which must never happen. 
+ async fn wait_for_lsn(&mut self) -> anyhow::Result> { + let fp = (|| { + fail::fail_point!( + "sk-pause-send", + self.appname.as_deref() != Some("pageserver"), + |_| { true } + ); + false + })(); + if fp { + tokio::time::sleep(POLL_STATE_TIMEOUT).await; + return Ok(None); + } + + let res = timeout(POLL_STATE_TIMEOUT, async move { + loop { + let end_pos = self.end_watch.get(); + if end_pos > self.start_pos { + return Ok(end_pos); + } + if let EndWatch::Flush(rx) = &self.end_watch { + let curr_term = rx.borrow().term; + if let Some(client_term) = self.term { + if curr_term != client_term { + bail!("term changed: requested {}, now {}", client_term, curr_term); + } + } + } + self.end_watch.changed().await?; + } + }) + .await; + + match res { + // success + Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), + // error inside closure + Ok(Err(err)) => Err(err), + // timeout + Err(_) => Ok(None), + } + } } /// A half driving receiving replies. struct ReplyReader { reader: PostgresBackendReader, ws_guard: Arc, + tli: FullAccessTimeline, } impl ReplyReader { async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> { loop { let msg = self.reader.read_copy_message().await?; - self.handle_feedback(&msg)? + self.handle_feedback(&msg).await? } } - fn handle_feedback(&mut self, msg: &Bytes) -> anyhow::Result<()> { + async fn handle_feedback(&mut self, msg: &Bytes) -> anyhow::Result<()> { match msg.first().cloned() { Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => { // Note: deserializing is on m[1..] because we skip the tag byte. - let hs_feedback = HotStandbyFeedback::des(&msg[1..]) + let mut hs_feedback = HotStandbyFeedback::des(&msg[1..]) .context("failed to deserialize HotStandbyFeedback")?; + // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way: + // pq_sendint32(&reply_message, xmin); + // pq_sendint32(&reply_message, xmin_epoch); + // So it is two big endian 32-bit words in low endian order! 
+ hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32); + hs_feedback.catalog_xmin = + (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32); self.ws_guard .walsenders .record_hs_feedback(self.ws_guard.id, &hs_feedback); @@ -675,6 +781,9 @@ impl ReplyReader { self.ws_guard .walsenders .record_ps_feedback(self.ws_guard.id, &ps_feedback); + self.tli + .update_remote_consistent_lsn(ps_feedback.remote_consistent_lsn) + .await; // in principle new remote_consistent_lsn could allow to // deactivate the timeline, but we check that regularly through // broker updated, not need to do it here @@ -685,50 +794,8 @@ impl ReplyReader { } } -const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); - -/// Wait until we have available WAL > start_pos or timeout expires. Returns -/// - Ok(Some(end_pos)) if needed lsn is successfully observed; -/// - Ok(None) if timeout expired; -/// - Err in case of error -- only if 1) term changed while fetching in recovery -/// mode 2) watch channel closed, which must never happen. 
-async fn wait_for_lsn( - rx: &mut EndWatch, - client_term: Option, - start_pos: Lsn, -) -> anyhow::Result> { - let res = timeout(POLL_STATE_TIMEOUT, async move { - loop { - let end_pos = rx.get(); - if end_pos > start_pos { - return Ok(end_pos); - } - if let EndWatch::Flush(rx) = rx { - let curr_term = rx.borrow().term; - if let Some(client_term) = client_term { - if curr_term != client_term { - bail!("term changed: requested {}, now {}", client_term, curr_term); - } - } - } - rx.changed().await?; - } - }) - .await; - - match res { - // success - Ok(Ok(commit_lsn)) => Ok(Some(commit_lsn)), - // error inside closure - Ok(Err(err)) => Err(err), - // timeout - Err(_) => Ok(None), - } -} - #[cfg(test)] mod tests { - use postgres_protocol::PG_EPOCH; use utils::id::{TenantId, TimelineId}; use super::*; @@ -774,8 +841,11 @@ mod tests { fn test_hs_feedback_no_valid() { let mut wss = WalSendersShared::new(); push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID); + wss.update_reply_feedback(); + assert_eq!( + wss.agg_standby_feedback.hs_feedback.xmin, + INVALID_FULL_TRANSACTION_ID + ); } #[test] @@ -784,30 +854,7 @@ mod tests { push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID)); push_feedback(&mut wss, hs_feedback(1, 42)); push_feedback(&mut wss, hs_feedback(1, 64)); - wss.update_hs_feedback(); - assert_eq!(wss.agg_hs_feedback.xmin, 42); - } - - // form pageserver feedback with given last_record_lsn / tli size and the - // rest set to dummy values. 
- fn ps_feedback(current_timeline_size: u64, last_received_lsn: Lsn) -> ReplicationFeedback { - ReplicationFeedback::Pageserver(PageserverFeedback { - current_timeline_size, - last_received_lsn, - disk_consistent_lsn: Lsn::INVALID, - remote_consistent_lsn: Lsn::INVALID, - replytime: *PG_EPOCH, - }) - } - - // test that ps aggregation works as expected - #[test] - fn test_ps_feedback() { - let mut wss = WalSendersShared::new(); - push_feedback(&mut wss, ps_feedback(8, Lsn(42))); - push_feedback(&mut wss, ps_feedback(4, Lsn(84))); - wss.update_ps_feedback(); - assert_eq!(wss.agg_ps_feedback.current_timeline_size, 4); - assert_eq!(wss.agg_ps_feedback.last_received_lsn, Lsn(84)); + wss.update_reply_feedback(); + assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42); } } diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs new file mode 100644 index 0000000000..be5e516296 --- /dev/null +++ b/safekeeper/src/state.rs @@ -0,0 +1,202 @@ +//! Defines per timeline data stored persistently (SafeKeeperPersistentState) +//! and its wrapper with in memory layer (SafekeeperState). + +use std::ops::Deref; + +use anyhow::Result; +use serde::{Deserialize, Serialize}; +use utils::{ + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use crate::{ + control_file, + safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory}, + wal_backup_partial::{self}, +}; + +/// Persistent information stored on safekeeper node about timeline. +/// On disk data is prefixed by magic and format version and followed by checksum. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TimelinePersistentState { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. 
Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. 
+ pub partial_backup: wal_backup_partial::State, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); + +impl TimelinePersistentState { + pub fn new( + ttid: &TenantTimelineId, + server_info: ServerInfo, + peers: Vec, + commit_lsn: Lsn, + local_start_lsn: Lsn, + ) -> TimelinePersistentState { + TimelinePersistentState { + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, + acceptor_state: AcceptorState { + term: 0, + term_history: TermHistory::empty(), + }, + server: server_info, + proposer_uuid: [0; 16], + timeline_start_lsn: Lsn(0), + local_start_lsn, + commit_lsn, + backup_lsn: local_start_lsn, + peer_horizon_lsn: local_start_lsn, + remote_consistent_lsn: Lsn(0), + peers: PersistedPeers( + peers + .iter() + .map(|p| (*p, PersistedPeerInfo::new())) + .collect(), + ), + partial_backup: wal_backup_partial::State::default(), + } + } + + #[cfg(test)] + pub fn empty() -> Self { + use crate::safekeeper::UNKNOWN_SERVER_VERSION; + + TimelinePersistentState::new( + &TenantTimelineId::empty(), + ServerInfo { + pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ + system_id: 0, /* Postgres system identifier */ + wal_seg_size: 0, + }, + vec![], + Lsn::INVALID, + Lsn::INVALID, + ) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +// In memory safekeeper state. Fields mirror ones in `SafeKeeperPersistentState`; values +// are not flushed yet. +pub struct TimelineMemState { + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub peer_horizon_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, +} + +/// Safekeeper persistent state plus in memory layer, to avoid frequent fsyncs +/// when we update fields like commit_lsn which don't need immediate +/// persistence. Provides transactional like API to atomically update the state. +/// +/// Implements Deref into *persistent* part. 
+pub struct TimelineState { + pub inmem: TimelineMemState, + pub pers: CTRL, // persistent +} + +impl TimelineState +where + CTRL: control_file::Storage, +{ + pub fn new(state: CTRL) -> Self { + TimelineState { + inmem: TimelineMemState { + commit_lsn: state.commit_lsn, + backup_lsn: state.backup_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, + remote_consistent_lsn: state.remote_consistent_lsn, + proposer_uuid: state.proposer_uuid, + }, + pers: state, + } + } + + /// Start atomic change. Returns SafeKeeperPersistentState with in memory + /// values applied; the protocol is to 1) change returned struct as desired + /// 2) atomically persist it with finish_change. + pub fn start_change(&self) -> TimelinePersistentState { + let mut s = self.pers.clone(); + s.commit_lsn = self.inmem.commit_lsn; + s.backup_lsn = self.inmem.backup_lsn; + s.peer_horizon_lsn = self.inmem.peer_horizon_lsn; + s.remote_consistent_lsn = self.inmem.remote_consistent_lsn; + s.proposer_uuid = self.inmem.proposer_uuid; + s + } + + /// Persist given state. c.f. start_change. + pub async fn finish_change(&mut self, s: &TimelinePersistentState) -> Result<()> { + self.pers.persist(s).await?; + // keep in memory values up to date + self.inmem.commit_lsn = s.commit_lsn; + self.inmem.backup_lsn = s.backup_lsn; + self.inmem.peer_horizon_lsn = s.peer_horizon_lsn; + self.inmem.remote_consistent_lsn = s.remote_consistent_lsn; + self.inmem.proposer_uuid = s.proposer_uuid; + Ok(()) + } + + /// Flush in memory values. 
+ pub async fn flush(&mut self) -> Result<()> { + let s = self.start_change(); + self.finish_change(&s).await + } +} + +impl Deref for TimelineState +where + CTRL: control_file::Storage, +{ + type Target = TimelinePersistentState; + + fn deref(&self) -> &Self::Target { + &self.pers + } +} diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2f284abe8c..544ffdbb36 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,18 +3,18 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; -use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; -use tokio::fs; +use tokio::fs::{self}; +use tokio_util::sync::CancellationToken; +use utils::id::TenantId; use std::cmp::max; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::Arc; use std::time::Duration; -use tokio::sync::{Mutex, MutexGuard}; -use tokio::{ - sync::{mpsc::Sender, watch}, - time::Instant, -}; +use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use tokio::{sync::watch, time::Instant}; use tracing::*; use utils::http::error::ApiError; use utils::{ @@ -26,18 +26,20 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use crate::receive_wal::WalReceivers; -use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo}; use crate::safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, - SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM, + AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, + INVALID_TERM, }; use crate::send_wal::WalSenders; +use crate::state::{TimelineMemState, TimelinePersistentState}; +use crate::timelines_set::TimelinesSet; +use crate::wal_backup::{self}; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; -use crate::wal_storage::Storage as wal_storage_iface; -use 
crate::SafeKeeperConf; -use crate::{debug_dump, wal_storage}; +use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; +use crate::{debug_dump, timeline_manager, wal_storage}; +use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -49,8 +51,7 @@ pub struct PeerInfo { /// LSN of the last record. pub flush_lsn: Lsn, pub commit_lsn: Lsn, - /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new - /// sk since backup_lsn. + /// Since which LSN safekeeper has WAL. pub local_start_lsn: Lsn, /// When info was received. Serde annotations are not very useful but make /// the code compile -- we don't rely on this field externally. @@ -95,25 +96,81 @@ impl PeersInfo { } } +pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; + +/// WriteGuardSharedState is a wrapper around `RwLockWriteGuard` that +/// automatically updates `watch::Sender` channels with state on drop. 
+pub struct WriteGuardSharedState<'a> { + tli: Arc, + guard: RwLockWriteGuard<'a, SharedState>, + skip_update: bool, +} + +impl<'a> WriteGuardSharedState<'a> { + fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { + WriteGuardSharedState { + tli, + guard, + skip_update: false, + } + } +} + +impl<'a> Deref for WriteGuardSharedState<'a> { + type Target = SharedState; + + fn deref(&self) -> &Self::Target { + &self.guard + } +} + +impl<'a> DerefMut for WriteGuardSharedState<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.guard + } +} + +impl<'a> Drop for WriteGuardSharedState<'a> { + fn drop(&mut self) { + let term_flush_lsn = TermLsn::from((self.guard.sk.get_term(), self.guard.sk.flush_lsn())); + let commit_lsn = self.guard.sk.state.inmem.commit_lsn; + + let _ = self.tli.term_flush_lsn_watch_tx.send_if_modified(|old| { + if *old != term_flush_lsn { + *old = term_flush_lsn; + true + } else { + false + } + }); + + let _ = self.tli.commit_lsn_watch_tx.send_if_modified(|old| { + if *old != commit_lsn { + *old = commit_lsn; + true + } else { + false + } + }); + + if !self.skip_update { + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); + } + } +} + /// Shared state associated with database instance pub struct SharedState { /// Safekeeper object - sk: SafeKeeper, + pub(crate) sk: SafeKeeper, /// In memory list containing state of peers sent in latest messages from them. - peers_info: PeersInfo, - /// True when WAL backup launcher oversees the timeline, making sure WAL is - /// offloaded, allows to bother launcher less. - wal_backup_active: bool, - /// True whenever there is at least some pending activity on timeline: live - /// compute connection, pageserver is not caughtup (it must have latest WAL - /// for new compute start) or WAL backuping is not finished. Practically it - /// means safekeepers broadcast info to peers about the timeline, old WAL is - /// trimmed. 
- /// - /// TODO: it might be better to remove tli completely from GlobalTimelines - /// when tli is inactive instead of having this flag. - active: bool, - last_removed_segno: XLogSegNo, + pub(crate) peers_info: PeersInfo, + // True value hinders old WAL removal; this is used by snapshotting. We + // could make it a counter, but there is no need to. + pub(crate) wal_removal_on_hold: bool, } impl SharedState { @@ -121,7 +178,7 @@ impl SharedState { fn create_new( conf: &SafeKeeperConf, ttid: &TenantTimelineId, - state: SafeKeeperState, + state: TimelinePersistentState, ) -> Result { if state.server.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); @@ -141,106 +198,39 @@ impl SharedState { // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. - let timeline_dir = conf.timeline_dir(ttid); - let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?; + let timeline_dir = get_timeline_dir(conf, ttid); + let control_store = + control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?; let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; Ok(Self { sk, peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, - last_removed_segno: 0, + wal_removal_on_hold: false, }) } /// Restore SharedState from control file. If file doesn't exist, bails out. 
fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + let timeline_dir = get_timeline_dir(conf, ttid); let control_store = control_file::FileStorage::restore_new(ttid, conf)?; if control_store.server.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } let wal_store = - wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; + wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; Ok(Self { sk: SafeKeeper::new(control_store, wal_store, conf.my_id)?, peers_info: PeersInfo(vec![]), - wal_backup_active: false, - active: false, - last_removed_segno: 0, + wal_removal_on_hold: false, }) } - fn is_active(&self, num_computes: usize, remote_consistent_lsn: Lsn) -> bool { - self.is_wal_backup_required(num_computes) - // FIXME: add tracking of relevant pageservers and check them here individually, - // otherwise migration won't work (we suspend too early). - || remote_consistent_lsn < self.sk.inmem.commit_lsn - } - - /// Mark timeline active/inactive and return whether s3 offloading requires - /// start/stop action. If timeline is deactivated, control file is persisted - /// as maintenance task does that only for active timelines. - async fn update_status( - &mut self, - num_computes: usize, - remote_consistent_lsn: Lsn, - ttid: TenantTimelineId, - ) -> bool { - let is_active = self.is_active(num_computes, remote_consistent_lsn); - if self.active != is_active { - info!( - "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}", - ttid, is_active, remote_consistent_lsn, self.sk.inmem.commit_lsn - ); - if !is_active { - if let Err(e) = self.sk.persist_inmem(remote_consistent_lsn).await { - warn!("control file save in update_status failed: {:?}", e); - } - } - } - self.active = is_active; - self.is_wal_backup_action_pending(num_computes) - } - - /// Should we run s3 offloading in current state? 
- fn is_wal_backup_required(&self, num_computes: usize) -> bool { - let seg_size = self.get_wal_seg_size(); - num_computes > 0 || - // Currently only the whole segment is offloaded, so compare segment numbers. - (self.sk.inmem.commit_lsn.segment_number(seg_size) > - self.sk.inmem.backup_lsn.segment_number(seg_size)) - } - - /// Is current state of s3 offloading is not what it ought to be? - fn is_wal_backup_action_pending(&self, num_computes: usize) -> bool { - let res = self.wal_backup_active != self.is_wal_backup_required(num_computes); - if res { - let action_pending = if self.is_wal_backup_required(num_computes) { - "start" - } else { - "stop" - }; - trace!( - "timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}", - self.sk.state.timeline_id, action_pending, num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn - ); - } - res - } - - /// Returns whether s3 offloading is required and sets current status as - /// matching. - fn wal_backup_attend(&mut self, num_computes: usize) -> bool { - self.wal_backup_active = self.is_wal_backup_required(num_computes); - self.wal_backup_active - } - - fn get_wal_seg_size(&self) -> usize { + pub(crate) fn get_wal_seg_size(&self) -> usize { self.sk.state.server.wal_seg_size as usize } @@ -248,7 +238,7 @@ impl SharedState { &self, ttid: &TenantTimelineId, conf: &SafeKeeperConf, - remote_consistent_lsn: Lsn, + standby_apply_lsn: Lsn, ) -> SafekeeperTimelineInfo { SafekeeperTimelineInfo { safekeeper_id: conf.my_id.0, @@ -257,27 +247,28 @@ impl SharedState { timeline_id: ttid.timeline_id.as_ref().to_owned(), }), term: self.sk.state.acceptor_state.term, - last_log_term: self.sk.get_epoch(), + last_log_term: self.sk.get_last_log_term(), flush_lsn: self.sk.flush_lsn().0, // note: this value is not flushed to control file yet and can be lost - commit_lsn: self.sk.inmem.commit_lsn.0, - remote_consistent_lsn: remote_consistent_lsn.0, - peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0, + 
commit_lsn: self.sk.state.inmem.commit_lsn.0, + remote_consistent_lsn: self.sk.state.inmem.remote_consistent_lsn.0, + peer_horizon_lsn: self.sk.state.inmem.peer_horizon_lsn.0, safekeeper_connstr: conf .advertise_pg_addr .to_owned() .unwrap_or(conf.listen_pg_addr.clone()), http_connstr: conf.listen_http_addr.to_owned(), - backup_lsn: self.sk.inmem.backup_lsn.0, + backup_lsn: self.sk.state.inmem.backup_lsn.0, local_start_lsn: self.sk.state.local_start_lsn.0, availability_zone: conf.availability_zone.clone(), + standby_horizon: standby_apply_lsn.0, } } /// Get our latest view of alive peers status on the timeline. /// We pass our own info through the broker as well, so when we don't have connection /// to the broker returned vec is empty. - fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { + pub(crate) fn get_peers(&self, heartbeat_timeout: Duration) -> Vec { let now = Instant::now(); self.peers_info .0 @@ -322,11 +313,6 @@ impl From for ApiError { pub struct Timeline { pub ttid: TenantTimelineId, - /// Sending here asks for wal backup launcher attention (start/stop - /// offloading). Sending ttid instead of concrete command allows to do - /// sending without timeline lock. - pub wal_backup_launcher_tx: Sender, - /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, commit_lsn_watch_rx: watch::Receiver, @@ -338,56 +324,58 @@ pub struct Timeline { term_flush_lsn_watch_tx: watch::Sender, term_flush_lsn_watch_rx: watch::Receiver, + /// Broadcasts shared state updates. + shared_state_version_tx: watch::Sender, + shared_state_version_rx: watch::Receiver, + /// Safekeeper and other state, that should remain consistent and /// synchronized with the disk. This is tokio mutex as we write WAL to disk /// while holding it, ensuring that consensus checks are in order. - mutex: Mutex, + mutex: RwLock, walsenders: Arc, walreceivers: Arc, + timeline_dir: Utf8PathBuf, - /// Cancellation channel. 
Delete/cancel will send `true` here as a cancellation signal. - cancellation_tx: watch::Sender, + /// Delete/cancel will trigger this, background tasks should drop out as soon as it fires + pub(crate) cancel: CancellationToken, - /// Timeline should not be used after cancellation. Background tasks should - /// monitor this channel and stop eventually after receiving `true` from this channel. - cancellation_rx: watch::Receiver, - - /// Directory where timeline state is stored. - pub timeline_dir: Utf8PathBuf, + // timeline_manager controlled state + pub(crate) broker_active: AtomicBool, + pub(crate) wal_backup_active: AtomicBool, + pub(crate) last_removed_segno: AtomicU64, } impl Timeline { /// Load existing timeline from disk. - pub fn load_timeline( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, - ) -> Result { + pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); let shared_state = SharedState::restore(conf, &ttid)?; - let rcl = shared_state.sk.state.remote_consistent_lsn; let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(shared_state.sk.state.commit_lsn); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from(( shared_state.sk.get_term(), shared_state.sk.flush_lsn(), ))); - let (cancellation_tx, cancellation_rx) = watch::channel(false); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); + let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, - wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(shared_state), - walsenders: WalSenders::new(rcl), - walreceivers: WalReceivers::new(), - cancellation_rx, - cancellation_tx, - timeline_dir: conf.timeline_dir(&ttid), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(shared_state), + 
walsenders: WalSenders::new(walreceivers.clone()), + walreceivers, + cancel: CancellationToken::default(), + timeline_dir: get_timeline_dir(conf, &ttid), + broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), }) } @@ -395,7 +383,6 @@ impl Timeline { pub fn create_empty( conf: &SafeKeeperConf, ttid: TenantTimelineId, - wal_backup_launcher_tx: Sender, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, @@ -403,22 +390,28 @@ impl Timeline { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID))); - let (cancellation_tx, cancellation_rx) = watch::channel(false); - let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); + let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); + let state = + TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); + + let walreceivers = WalReceivers::new(); Ok(Timeline { ttid, - wal_backup_launcher_tx, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, term_flush_lsn_watch_rx, - mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?), - walsenders: WalSenders::new(Lsn(0)), - walreceivers: WalReceivers::new(), - cancellation_rx, - cancellation_tx, - timeline_dir: conf.timeline_dir(&ttid), + shared_state_version_tx, + shared_state_version_rx, + mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?), + walsenders: WalSenders::new(walreceivers.clone()), + walreceivers, + cancel: CancellationToken::default(), + timeline_dir: get_timeline_dir(conf, &ttid), + broker_active: AtomicBool::new(false), + wal_backup_active: AtomicBool::new(false), + last_removed_segno: AtomicU64::new(0), }) } @@ -429,8 +422,9 @@ impl Timeline { /// and state on disk should remain unchanged. 
pub async fn init_new( self: &Arc, - shared_state: &mut MutexGuard<'_, SharedState>, + shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, + broker_active_set: Arc, ) -> Result<()> { match fs::metadata(&self.timeline_dir).await { Ok(_) => { @@ -448,7 +442,7 @@ impl Timeline { fs::create_dir_all(&self.timeline_dir).await?; // Write timeline to disk and start background tasks. - if let Err(e) = shared_state.sk.persist_inmem(Lsn::INVALID).await { + if let Err(e) = shared_state.sk.state.flush().await { // Bootstrap failed, cancel timeline and remove timeline directory. self.cancel(shared_state); @@ -461,41 +455,56 @@ impl Timeline { return Err(e); } - self.bootstrap(conf); + self.bootstrap(conf, broker_active_set); Ok(()) } - /// Bootstrap new or existing timeline starting background stasks. - pub fn bootstrap(self: &Arc, conf: &SafeKeeperConf) { - // Start recovery task which always runs on the timeline. - if conf.peer_recovery_enabled { - tokio::spawn(recovery_main(self.clone(), conf.clone())); - } + /// Bootstrap new or existing timeline starting background tasks. + pub fn bootstrap( + self: &Arc, + conf: &SafeKeeperConf, + broker_active_set: Arc, + ) { + // Start manager task which will monitor timeline state and update + // background tasks. + tokio::spawn(timeline_manager::main_task( + self.clone(), + conf.clone(), + broker_active_set, + )); } - /// Delete timeline from disk completely, by removing timeline directory. Background - /// timeline activities will stop eventually. - pub async fn delete_from_disk( + /// Delete timeline from disk completely, by removing timeline directory. + /// Background timeline activities will stop eventually. + /// + /// Also deletes WAL in s3. Might fail if e.g. s3 is unavailable, but + /// deletion API endpoint is retriable. 
+ pub async fn delete( &self, - shared_state: &mut MutexGuard<'_, SharedState>, - ) -> Result<(bool, bool)> { - let was_active = shared_state.active; + shared_state: &mut WriteGuardSharedState<'_>, + only_local: bool, + ) -> Result { self.cancel(shared_state); + + // TODO: It's better to wait for s3 offloader termination before + // removing data from s3. Though since s3 doesn't have transactions it + // still wouldn't guarantee absense of data after removal. + let conf = GlobalTimelines::get_global_config(); + if !only_local && conf.is_wal_backup_enabled() { + // Note: we concurrently delete remote storage data from multiple + // safekeepers. That's ok, s3 replies 200 if object doesn't exist and we + // do some retries anyway. + wal_backup::delete_timeline(&self.ttid).await?; + } let dir_existed = delete_dir(&self.timeline_dir).await?; - Ok((dir_existed, was_active)) + Ok(dir_existed) } /// Cancel timeline to prevent further usage. Background tasks will stop /// eventually after receiving cancellation signal. - /// - /// Note that we can't notify backup launcher here while holding - /// shared_state lock, as this is a potential deadlock: caller is - /// responsible for that. Generally we should probably make WAL backup tasks - /// to shut down on their own, checking once in a while whether it is the - /// time. - fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) { + fn cancel(&self, shared_state: &mut WriteGuardSharedState<'_>) { info!("timeline {} is cancelled", self.ttid); - let _ = self.cancellation_tx.send(true); + self.cancel.cancel(); // Close associated FDs. Nobody will be able to touch timeline data once // it is cancelled, so WAL storage won't be opened again. shared_state.sk.wal_store.close(); @@ -503,90 +512,16 @@ impl Timeline { /// Returns if timeline is cancelled. pub fn is_cancelled(&self) -> bool { - *self.cancellation_rx.borrow() - } - - /// Returns watch channel which gets value when timeline is cancelled. 
It is - /// guaranteed to have not cancelled value observed (errors otherwise). - pub fn get_cancellation_rx(&self) -> Result> { - let rx = self.cancellation_rx.clone(); - if *rx.borrow() { - bail!(TimelineError::Cancelled(self.ttid)); - } - Ok(rx) + self.cancel.is_cancelled() } /// Take a writing mutual exclusive lock on timeline shared_state. - pub async fn write_shared_state(&self) -> MutexGuard { - self.mutex.lock().await + pub async fn write_shared_state<'a>(self: &'a Arc) -> WriteGuardSharedState<'a> { + WriteGuardSharedState::new(self.clone(), self.mutex.write().await) } - async fn update_status(&self, shared_state: &mut SharedState) -> bool { - shared_state - .update_status( - self.walreceivers.get_num(), - self.get_walsenders().get_remote_consistent_lsn(), - self.ttid, - ) - .await - } - - /// Update timeline status and kick wal backup launcher to stop/start offloading if needed. - pub async fn update_status_notify(&self) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - let is_wal_backup_action_pending: bool = { - let mut shared_state = self.write_shared_state().await; - self.update_status(&mut shared_state).await - }; - if is_wal_backup_action_pending { - // Can fail only if channel to a static thread got closed, which is not normal at all. - self.wal_backup_launcher_tx.send(self.ttid).await?; - } - Ok(()) - } - - /// Returns true if walsender should stop sending WAL to pageserver. We - /// terminate it if remote_consistent_lsn reached commit_lsn and there is no - /// computes. While there might be nothing to stream already, we learn about - /// remote_consistent_lsn update through replication feedback, and we want - /// to stop pushing to the broker if pageserver is fully caughtup. 
- pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { - if self.is_cancelled() { - return true; - } - let shared_state = self.write_shared_state().await; - if self.walreceivers.get_num() == 0 { - return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet - reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn; - } - false - } - - /// Ensure taht current term is t, erroring otherwise, and lock the state. - pub async fn acquire_term(&self, t: Term) -> Result> { - let ss = self.write_shared_state().await; - if ss.sk.state.acceptor_state.term != t { - bail!( - "failed to acquire term {}, current term {}", - t, - ss.sk.state.acceptor_state.term - ); - } - Ok(ss) - } - - /// Returns whether s3 offloading is required and sets current status as - /// matching it. - pub async fn wal_backup_attend(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state() - .await - .wal_backup_attend(self.walreceivers.get_num()) + pub async fn read_shared_state(&self) -> ReadGuardSharedState { + self.mutex.read().await } /// Returns commit_lsn watch channel. @@ -599,72 +534,35 @@ impl Timeline { self.term_flush_lsn_watch_rx.clone() } - /// Pass arrived message to the safekeeper. - pub async fn process_msg( - &self, - msg: &ProposerAcceptorMessage, - ) -> Result> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - let mut rmsg: Option; - let commit_lsn: Lsn; - let term_flush_lsn: TermLsn; - { - let mut shared_state = self.write_shared_state().await; - rmsg = shared_state.sk.process_msg(msg).await?; - - // if this is AppendResponse, fill in proper pageserver and hot - // standby feedback. 
- if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { - let (ps_feedback, hs_feedback) = self.walsenders.get_feedbacks(); - resp.hs_feedback = hs_feedback; - resp.pageserver_feedback = ps_feedback; - } - - commit_lsn = shared_state.sk.inmem.commit_lsn; - term_flush_lsn = - TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn())); - } - self.commit_lsn_watch_tx.send(commit_lsn)?; - self.term_flush_lsn_watch_tx.send(term_flush_lsn)?; - Ok(rmsg) + /// Returns watch channel for SharedState update version. + pub fn get_state_version_rx(&self) -> watch::Receiver { + self.shared_state_version_rx.clone() } /// Returns wal_seg_size. pub async fn get_wal_seg_size(&self) -> usize { - self.write_shared_state().await.get_wal_seg_size() - } - - /// Returns true only if the timeline is loaded and active. - pub async fn is_active(&self) -> bool { - if self.is_cancelled() { - return false; - } - - self.write_shared_state().await.active + self.read_shared_state().await.get_wal_seg_size() } /// Returns state of the timeline. - pub async fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) { - let state = self.write_shared_state().await; - (state.sk.inmem.clone(), state.sk.state.clone()) + pub async fn get_state(&self) -> (TimelineMemState, TimelinePersistentState) { + let state = self.read_shared_state().await; + (state.sk.state.inmem.clone(), state.sk.state.clone()) } /// Returns latest backup_lsn. pub async fn get_wal_backup_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.inmem.backup_lsn + self.read_shared_state().await.sk.state.inmem.backup_lsn } /// Sets backup_lsn to the given value. 
- pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> { + pub async fn set_wal_backup_lsn(self: &Arc, backup_lsn: Lsn) -> Result<()> { if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } let mut state = self.write_shared_state().await; - state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn); + state.sk.state.inmem.backup_lsn = max(state.sk.state.inmem.backup_lsn, backup_lsn); // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. Ok(()) @@ -672,124 +570,30 @@ impl Timeline { /// Get safekeeper info for broadcasting to broker and other peers. pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo { - let shared_state = self.write_shared_state().await; - shared_state.get_safekeeper_info( - &self.ttid, - conf, - self.walsenders.get_remote_consistent_lsn(), - ) + let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn; + let shared_state = self.read_shared_state().await; + shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn) } /// Update timeline state with peer safekeeper data. - pub async fn record_safekeeper_info(&self, mut sk_info: SafekeeperTimelineInfo) -> Result<()> { - // Update local remote_consistent_lsn in memory (in .walsenders) and in - // sk_info to pass it down to control file. 
- sk_info.remote_consistent_lsn = self - .walsenders - .update_remote_consistent_lsn(Lsn(sk_info.remote_consistent_lsn)) - .0; - let is_wal_backup_action_pending: bool; - let commit_lsn: Lsn; + pub async fn record_safekeeper_info( + self: &Arc, + sk_info: SafekeeperTimelineInfo, + ) -> Result<()> { { let mut shared_state = self.write_shared_state().await; shared_state.sk.record_safekeeper_info(&sk_info).await?; let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now()); shared_state.peers_info.upsert(&peer_info); - is_wal_backup_action_pending = self.update_status(&mut shared_state).await; - commit_lsn = shared_state.sk.inmem.commit_lsn; - } - self.commit_lsn_watch_tx.send(commit_lsn)?; - // Wake up wal backup launcher, if it is time to stop the offloading. - if is_wal_backup_action_pending { - self.wal_backup_launcher_tx.send(self.ttid).await?; } Ok(()) } pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec { - let shared_state = self.write_shared_state().await; + let shared_state = self.read_shared_state().await; shared_state.get_peers(conf.heartbeat_timeout) } - /// Should we start fetching WAL from a peer safekeeper, and if yes, from - /// which? Answer is yes, i.e. .donors is not empty if 1) there is something - /// to fetch, and we can do that without running elections; 2) there is no - /// actively streaming compute, as we don't want to compete with it. - /// - /// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal - /// to its last_log_term so we are sure such a leader ever had been elected. - /// - /// All possible donors are returned so that we could keep connection to the - /// current one if it is good even if it slightly lags behind. - /// - /// Note that term conditions above might be not met, but safekeepers are - /// still not aligned on last flush_lsn. 
Generally in this case until - /// elections are run it is not possible to say which safekeeper should - /// recover from which one -- history which would be committed is different - /// depending on assembled quorum (e.g. classic picture 8 from Raft paper). - /// Thus we don't try to predict it here. - pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo { - let ss = self.write_shared_state().await; - let term = ss.sk.state.acceptor_state.term; - let last_log_term = ss.sk.get_epoch(); - let flush_lsn = ss.sk.flush_lsn(); - // note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us. - let mut peers = ss.get_peers(heartbeat_timeout); - // Sort by pairs. - peers.sort_by(|p1, p2| { - let tl1 = TermLsn { - term: p1.last_log_term, - lsn: p1.flush_lsn, - }; - let tl2 = TermLsn { - term: p2.last_log_term, - lsn: p2.flush_lsn, - }; - tl2.cmp(&tl1) // desc - }); - let num_streaming_computes = self.walreceivers.get_num_streaming(); - let donors = if num_streaming_computes > 0 { - vec![] // If there is a streaming compute, don't try to recover to not intervene. - } else { - peers - .iter() - .filter_map(|candidate| { - // Are we interested in this candidate? - let candidate_tl = TermLsn { - term: candidate.last_log_term, - lsn: candidate.flush_lsn, - }; - let my_tl = TermLsn { - term: last_log_term, - lsn: flush_lsn, - }; - if my_tl < candidate_tl { - // Yes, we are interested. Can we pull from it without - // (re)running elections? It is possible if 1) his term - // is equal to his last_log_term so we could act on - // behalf of leader of this term (we must be sure he was - // ever elected) and 2) our term is not higher, or we'll refuse data. 
- if candidate.term == candidate.last_log_term && candidate.term >= term { - Some(Donor::from(candidate)) - } else { - None - } - } else { - None - } - }) - .collect() - }; - RecoveryNeededInfo { - term, - last_log_term, - flush_lsn, - peers, - num_streaming_computes, - donors, - } - } - pub fn get_walsenders(&self) -> &Arc { &self.walsenders } @@ -800,82 +604,36 @@ impl Timeline { /// Returns flush_lsn. pub async fn get_flush_lsn(&self) -> Lsn { - self.write_shared_state().await.sk.wal_store.flush_lsn() + self.read_shared_state().await.sk.wal_store.flush_lsn() } - /// Delete WAL segments from disk that are no longer needed. This is determined - /// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn. - pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> { - if self.is_cancelled() { - bail!(TimelineError::Cancelled(self.ttid)); - } - - let horizon_segno: XLogSegNo; - let remover = { - let shared_state = self.write_shared_state().await; - horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled); - if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno { - return Ok(()); // nothing to do - } - - // release the lock before removing - shared_state.sk.wal_store.remove_up_to(horizon_segno - 1) - }; - - // delete old WAL files - remover.await?; - - // update last_removed_segno - let mut shared_state = self.write_shared_state().await; - shared_state.last_removed_segno = horizon_segno; - Ok(()) - } - - /// Persist control file if there is something to save and enough time - /// passed after the last save. This helps to keep remote_consistent_lsn up - /// to date so that storage nodes restart doesn't cause many pageserver -> - /// safekeeper reconnections. 
- pub async fn maybe_persist_control_file(&self) -> Result<()> { - let remote_consistent_lsn = self.walsenders.get_remote_consistent_lsn(); - self.write_shared_state() - .await - .sk - .maybe_persist_inmem_control_file(remote_consistent_lsn) - .await - } - - /// Gather timeline data for metrics. If the timeline is not active, returns - /// None, we do not collect these. + /// Gather timeline data for metrics. pub async fn info_for_metrics(&self) -> Option { if self.is_cancelled() { return None; } - let ps_feedback = self.walsenders.get_ps_feedback(); - let state = self.write_shared_state().await; - if state.active { - Some(FullTimelineInfo { - ttid: self.ttid, - ps_feedback, - wal_backup_active: state.wal_backup_active, - timeline_is_active: state.active, - num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, - mem_state: state.sk.inmem.clone(), - persisted_state: state.sk.state.clone(), - flush_lsn: state.sk.wal_store.flush_lsn(), - remote_consistent_lsn: self.get_walsenders().get_remote_consistent_lsn(), - wal_storage: state.sk.wal_store.get_metrics(), - }) - } else { - None - } + let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); + let state = self.read_shared_state().await; + Some(FullTimelineInfo { + ttid: self.ttid, + ps_feedback_count, + last_ps_feedback, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + timeline_is_active: self.broker_active.load(Ordering::Relaxed), + num_computes: self.walreceivers.get_num() as u32, + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + epoch_start_lsn: state.sk.term_start_lsn, + mem_state: state.sk.state.inmem.clone(), + persisted_state: state.sk.state.clone(), + flush_lsn: state.sk.wal_store.flush_lsn(), + wal_storage: state.sk.wal_store.get_metrics(), + }) } /// Returns in-memory timeline state to build a full debug dump. 
pub async fn memory_dump(&self) -> debug_dump::Memory { - let state = self.write_shared_state().await; + let state = self.read_shared_state().await; let (write_lsn, write_record_lsn, flush_lsn, file_open) = state.sk.wal_store.internal_state(); @@ -884,18 +642,136 @@ impl Timeline { is_cancelled: self.is_cancelled(), peers_info_len: state.peers_info.0.len(), walsenders: self.walsenders.get_all(), - wal_backup_active: state.wal_backup_active, - active: state.active, + wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), + active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, - last_removed_segno: state.last_removed_segno, - epoch_start_lsn: state.sk.epoch_start_lsn, - mem_state: state.sk.inmem.clone(), + last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + epoch_start_lsn: state.sk.term_start_lsn, + mem_state: state.sk.state.inmem.clone(), write_lsn, write_record_lsn, flush_lsn, file_open, } } + + /// Apply a function to the control file state and persist it. + pub async fn map_control_file( + self: &Arc, + f: impl FnOnce(&mut TimelinePersistentState) -> Result, + ) -> Result { + let mut state = self.write_shared_state().await; + let mut persistent_state = state.sk.state.start_change(); + // If f returns error, we abort the change and don't persist anything. + let res = f(&mut persistent_state)?; + // If persisting fails, we abort the change and return error. + state.sk.state.finish_change(&persistent_state).await?; + Ok(res) + } + + /// Get the timeline guard for reading/writing WAL files. + /// TODO: if WAL files are not present on disk (evicted), they will be + /// downloaded from S3. Also there will be logic for preventing eviction + /// while someone is holding FullAccessTimeline guard.
+ pub async fn full_access_guard(self: &Arc) -> Result { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + Ok(FullAccessTimeline { tli: self.clone() }) + } +} + +/// This is a guard that allows to read/write disk timeline state. +/// All tasks that are using the disk should use this guard. +#[derive(Clone)] +pub struct FullAccessTimeline { + pub tli: Arc, +} + +impl Deref for FullAccessTimeline { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.tli + } +} + +impl FullAccessTimeline { + /// Returns true if walsender should stop sending WAL to pageserver. We + /// terminate it if remote_consistent_lsn reached commit_lsn and there are no + /// computes. While there might be nothing to stream already, we learn about + /// remote_consistent_lsn update through replication feedback, and we want + /// to stop pushing to the broker if pageserver is fully caught up. + pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool { + if self.is_cancelled() { + return true; + } + let shared_state = self.read_shared_state().await; + if self.walreceivers.get_num() == 0 { + return shared_state.sk.state.inmem.commit_lsn == Lsn(0) || // no data at all yet + reported_remote_consistent_lsn >= shared_state.sk.state.inmem.commit_lsn; + } + false + } + + /// Ensure that current term is t, erroring otherwise, and lock the state. + pub async fn acquire_term(&self, t: Term) -> Result { + let ss = self.read_shared_state().await; + if ss.sk.state.acceptor_state.term != t { + bail!( + "failed to acquire term {}, current term {}", + t, + ss.sk.state.acceptor_state.term + ); + } + Ok(ss) + } + + /// Pass arrived message to the safekeeper.
+ pub async fn process_msg( + &self, + msg: &ProposerAcceptorMessage, + ) -> Result> { + if self.is_cancelled() { + bail!(TimelineError::Cancelled(self.ttid)); + } + + let mut rmsg: Option; + { + let mut shared_state = self.write_shared_state().await; + rmsg = shared_state.sk.process_msg(msg).await?; + + // if this is AppendResponse, fill in proper hot standby feedback. + if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg { + resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback; + } + } + Ok(rmsg) + } + + pub async fn get_walreader(&self, start_lsn: Lsn) -> Result { + let (_, persisted_state) = self.get_state().await; + let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled(); + + WalReader::new( + &self.ttid, + self.timeline_dir.clone(), + &persisted_state, + start_lsn, + enable_remote_read, + ) + } + + pub fn get_timeline_dir(&self) -> Utf8PathBuf { + self.timeline_dir.clone() + } + + /// Update in memory remote consistent lsn. + pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) { + let mut shared_state = self.write_shared_state().await; + shared_state.sk.state.inmem.remote_consistent_lsn = + max(shared_state.sk.state.inmem.remote_consistent_lsn, candidate); + } } /// Deletes directory and it's contents. Returns false if directory does not exist. @@ -906,3 +782,16 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result { Err(e) => Err(e.into()), } } + +/// Get a path to the tenant directory. If you just need to get a timeline directory, +/// use FullAccessTimeline::get_timeline_dir instead. +pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { + conf.workdir.join(tenant_id.to_string()) +} + +/// Get a path to the timeline directory. If you need to read WAL files from disk, +/// use FullAccessTimeline::get_timeline_dir instead. This function does not check +/// timeline eviction status and WAL files might not be present on disk. 
+pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { + get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string()) +} diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs new file mode 100644 index 0000000000..592426bba3 --- /dev/null +++ b/safekeeper/src/timeline_manager.rs @@ -0,0 +1,386 @@ +//! The timeline manager task is responsible for managing the timeline's background tasks. +//! It is spawned alongside each timeline and exits when the timeline is deleted. +//! It watches for changes in the timeline state and decides when to spawn or kill background tasks. +//! It also can manage some reactive state, like should the timeline be active for broker pushes or not. + +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; + +use postgres_ffi::XLogSegNo; +use tokio::task::{JoinError, JoinHandle}; +use tracing::{info, info_span, instrument, warn, Instrument}; +use utils::lsn::Lsn; + +use crate::{ + control_file::Storage, + metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + recovery::recovery_main, + remove_wal::calc_horizon_lsn, + send_wal::WalSenders, + timeline::{PeerInfo, ReadGuardSharedState, Timeline}, + timelines_set::{TimelineSetGuard, TimelinesSet}, + wal_backup::{self, WalBackupTaskHandle}, + wal_backup_partial, SafeKeeperConf, +}; + +pub struct StateSnapshot { + // inmem values + pub commit_lsn: Lsn, + pub backup_lsn: Lsn, + pub remote_consistent_lsn: Lsn, + + // persistent control file values + pub cfile_peer_horizon_lsn: Lsn, + pub cfile_remote_consistent_lsn: Lsn, + pub cfile_backup_lsn: Lsn, + + // misc + pub cfile_last_persist_at: Instant, + pub inmem_flush_pending: bool, + pub wal_removal_on_hold: bool, + pub peers: Vec, +} + +impl StateSnapshot { + /// Create a new snapshot of the timeline state. 
+ fn new(read_guard: ReadGuardSharedState, heartbeat_timeout: Duration) -> Self { + Self { + commit_lsn: read_guard.sk.state.inmem.commit_lsn, + backup_lsn: read_guard.sk.state.inmem.backup_lsn, + remote_consistent_lsn: read_guard.sk.state.inmem.remote_consistent_lsn, + cfile_peer_horizon_lsn: read_guard.sk.state.peer_horizon_lsn, + cfile_remote_consistent_lsn: read_guard.sk.state.remote_consistent_lsn, + cfile_backup_lsn: read_guard.sk.state.backup_lsn, + cfile_last_persist_at: read_guard.sk.state.pers.last_persist_at(), + inmem_flush_pending: Self::has_unflushed_inmem_state(&read_guard), + wal_removal_on_hold: read_guard.wal_removal_on_hold, + peers: read_guard.get_peers(heartbeat_timeout), + } + } + + fn has_unflushed_inmem_state(read_guard: &ReadGuardSharedState) -> bool { + let state = &read_guard.sk.state; + state.inmem.commit_lsn > state.commit_lsn + || state.inmem.backup_lsn > state.backup_lsn + || state.inmem.peer_horizon_lsn > state.peer_horizon_lsn + || state.inmem.remote_consistent_lsn > state.remote_consistent_lsn + } +} + +/// Control how often the manager task should wake up to check updates. +/// There is no need to check for updates more often than this. +const REFRESH_INTERVAL: Duration = Duration::from_millis(300); + +/// How often to save the control file if there is no other activity. +const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300); + +/// This task gets spawned alongside each timeline and is responsible for managing the timeline's +/// background tasks. +/// Be careful, this task is not respawned on panic, so it should not panic. +#[instrument(name = "manager", skip_all, fields(ttid = %tli.ttid))] +pub async fn main_task( + tli: Arc, + conf: SafeKeeperConf, + broker_active_set: Arc, +) { + scopeguard::defer!
{ + if tli.is_cancelled() { + info!("manager task finished"); + } else { + warn!("manager task finished prematurely"); + } + }; + + // configuration & dependencies + let wal_seg_size = tli.get_wal_seg_size().await; + let heartbeat_timeout = conf.heartbeat_timeout; + let walsenders = tli.get_walsenders(); + let walreceivers = tli.get_walreceivers(); + + // current state + let mut state_version_rx = tli.get_state_version_rx(); + let mut num_computes_rx = walreceivers.get_num_rx(); + let mut tli_broker_active = broker_active_set.guard(tli.clone()); + let mut last_removed_segno = 0 as XLogSegNo; + + // list of background tasks + let mut backup_task: Option = None; + let mut recovery_task: Option> = None; + let mut partial_backup_task: Option> = None; + let mut wal_removal_task: Option>> = None; + + // Start recovery task which always runs on the timeline. + if conf.peer_recovery_enabled { + match tli.full_access_guard().await { + Ok(tli) => { + recovery_task = Some(tokio::spawn(recovery_main(tli, conf.clone()))); + } + Err(e) => { + warn!("failed to start recovery task: {:?}", e); + } + } + } + + // Start partial backup task which always runs on the timeline. 
+ if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { + match tli.full_access_guard().await { + Ok(tli) => { + partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( + tli, + conf.clone(), + ))); + } + Err(e) => { + warn!("failed to start partial backup task: {:?}", e); + } + } + } + + let last_state = 'outer: loop { + MANAGER_ITERATIONS_TOTAL.inc(); + + let state_snapshot = StateSnapshot::new(tli.read_shared_state().await, heartbeat_timeout); + let num_computes = *num_computes_rx.borrow(); + + let is_wal_backup_required = update_backup( + &conf, + &tli, + wal_seg_size, + num_computes, + &state_snapshot, + &mut backup_task, + ) + .await; + + let _is_active = update_is_active( + is_wal_backup_required, + num_computes, + &state_snapshot, + &mut tli_broker_active, + &tli, + ); + + let next_cfile_save = update_control_file_save(&state_snapshot, &tli).await; + + update_wal_removal( + &conf, + walsenders, + &tli, + wal_seg_size, + &state_snapshot, + last_removed_segno, + &mut wal_removal_task, + ) + .await; + + // wait until something changes. tx channels are stored under Arc, so they will not be + // dropped until the manager task is finished. + tokio::select! 
{ + _ = tli.cancel.cancelled() => { + // timeline was deleted + break 'outer state_snapshot; + } + _ = async { + // don't wake up on every state change, but at most every REFRESH_INTERVAL + tokio::time::sleep(REFRESH_INTERVAL).await; + let _ = state_version_rx.changed().await; + } => { + // state was updated + } + _ = num_computes_rx.changed() => { + // number of connected computes was updated + } + _ = async { + if let Some(timeout) = next_cfile_save { + tokio::time::sleep_until(timeout).await + } else { + futures::future::pending().await + } + } => { + // it's time to save the control file + } + res = async { + if let Some(task) = &mut wal_removal_task { + task.await + } else { + futures::future::pending().await + } + } => { + // WAL removal task finished + wal_removal_task = None; + update_wal_removal_end(res, &tli, &mut last_removed_segno); + } + } + }; + + // remove timeline from the broker active set sooner, before waiting for background tasks + tli_broker_active.set(false); + + // shutdown background tasks + if conf.is_wal_backup_enabled() { + wal_backup::update_task(&conf, &tli, false, &last_state, &mut backup_task).await; + } + + if let Some(recovery_task) = recovery_task { + if let Err(e) = recovery_task.await { + warn!("recovery task failed: {:?}", e); + } + } + + if let Some(partial_backup_task) = partial_backup_task { + if let Err(e) = partial_backup_task.await { + warn!("partial backup task failed: {:?}", e); + } + } + + if let Some(wal_removal_task) = wal_removal_task { + let res = wal_removal_task.await; + update_wal_removal_end(res, &tli, &mut last_removed_segno); + } +} + +/// Spawns/kills backup task and returns true if backup is required. 
+async fn update_backup( + conf: &SafeKeeperConf, + tli: &Arc, + wal_seg_size: usize, + num_computes: usize, + state: &StateSnapshot, + backup_task: &mut Option, +) -> bool { + let is_wal_backup_required = + wal_backup::is_wal_backup_required(wal_seg_size, num_computes, state); + + if conf.is_wal_backup_enabled() { + wal_backup::update_task(conf, tli, is_wal_backup_required, state, backup_task).await; + } + + // update the state in Arc + tli.wal_backup_active + .store(backup_task.is_some(), std::sync::atomic::Ordering::Relaxed); + is_wal_backup_required +} + +/// Update is_active flag and returns its value. +fn update_is_active( + is_wal_backup_required: bool, + num_computes: usize, + state: &StateSnapshot, + tli_broker_active: &mut TimelineSetGuard, + tli: &Arc, +) -> bool { + let is_active = is_wal_backup_required + || num_computes > 0 + || state.remote_consistent_lsn < state.commit_lsn; + + // update the broker timeline set + if tli_broker_active.set(is_active) { + // write log if state has changed + info!( + "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}", + is_active, state.remote_consistent_lsn, state.commit_lsn, + ); + + MANAGER_ACTIVE_CHANGES.inc(); + } + + // update the state in Arc + tli.broker_active + .store(is_active, std::sync::atomic::Ordering::Relaxed); + is_active +} + +/// Save control file if needed. Returns Instant if we should persist the control file in the future. 
+async fn update_control_file_save( + state: &StateSnapshot, + tli: &Arc, +) -> Option { + if !state.inmem_flush_pending { + return None; + } + + if state.cfile_last_persist_at.elapsed() > CF_SAVE_INTERVAL { + let mut write_guard = tli.write_shared_state().await; + // this can be done in the background because it blocks manager task, but flush() should + // be fast enough not to be a problem now + if let Err(e) = write_guard.sk.state.flush().await { + warn!("failed to save control file: {:?}", e); + } + + None + } else { + // we should wait until next CF_SAVE_INTERVAL + Some((state.cfile_last_persist_at + CF_SAVE_INTERVAL).into()) + } +} + +/// Spawns WAL removal task if needed. +async fn update_wal_removal( + conf: &SafeKeeperConf, + walsenders: &Arc, + tli: &Arc, + wal_seg_size: usize, + state: &StateSnapshot, + last_removed_segno: u64, + wal_removal_task: &mut Option>>, +) { + if wal_removal_task.is_some() || state.wal_removal_on_hold { + // WAL removal is already in progress or hold off + return; + } + + // If enabled, we use LSN of the most lagging walsender as a WAL removal horizon. + // This allows to get better read speed for pageservers that are lagging behind, + // at the cost of keeping more WAL on disk. + let replication_horizon_lsn = if conf.walsenders_keep_horizon { + walsenders.laggard_lsn() + } else { + None + }; + + let removal_horizon_lsn = calc_horizon_lsn(state, replication_horizon_lsn); + let removal_horizon_segno = removal_horizon_lsn + .segment_number(wal_seg_size) + .saturating_sub(1); + + if removal_horizon_segno > last_removed_segno { + // we need to remove WAL + let remover = crate::wal_storage::Storage::remove_up_to( + &tli.read_shared_state().await.sk.wal_store, + removal_horizon_segno, + ); + *wal_removal_task = Some(tokio::spawn( + async move { + remover.await?; + Ok(removal_horizon_segno) + } + .instrument(info_span!("WAL removal", ttid=%tli.ttid)), + )); + } +} + +/// Update the state after WAL removal task finished. 
+fn update_wal_removal_end( + res: Result, JoinError>, + tli: &Arc, + last_removed_segno: &mut u64, +) { + let new_last_removed_segno = match res { + Ok(Ok(segno)) => segno, + Err(e) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + Ok(Err(e)) => { + warn!("WAL removal task failed: {:?}", e); + return; + } + }; + + *last_removed_segno = new_last_removed_segno; + // update the state in Arc + tli.last_removed_segno + .store(new_last_removed_segno, std::sync::atomic::Ordering::Relaxed); +} diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 92ac5ba66d..45e08ede3c 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -3,7 +3,8 @@ //! all from the disk on startup and keeping them in memory. use crate::safekeeper::ServerInfo; -use crate::timeline::{Timeline, TimelineError}; +use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; +use crate::timelines_set::TimelinesSet; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -11,16 +12,16 @@ use once_cell::sync::Lazy; use serde::Serialize; use std::collections::HashMap; use std::str::FromStr; +use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; -use tokio::sync::mpsc::Sender; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, - wal_backup_launcher_tx: Option>, conf: Option, + broker_active_set: Arc, load_lock: Arc>, } @@ -36,11 +37,8 @@ impl GlobalTimelinesState { } /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (SafeKeeperConf, Sender) { - ( - self.get_conf().clone(), - self.wal_backup_launcher_tx.as_ref().unwrap().clone(), - ) + fn get_dependencies(&self) -> (SafeKeeperConf, Arc) { + (self.get_conf().clone(), self.broker_active_set.clone()) } /// Insert timeline into the map. 
Returns error if timeline with the same id already exists. @@ -65,8 +63,8 @@ impl GlobalTimelinesState { static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), - wal_backup_launcher_tx: None, conf: None, + broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), }) }); @@ -76,16 +74,11 @@ pub struct GlobalTimelines; impl GlobalTimelines { /// Inject dependencies needed for the timeline constructors and load all timelines to memory. - pub async fn init( - conf: SafeKeeperConf, - wal_backup_launcher_tx: Sender, - ) -> Result<()> { + pub async fn init(conf: SafeKeeperConf) -> Result<()> { // clippy isn't smart enough to understand that drop(state) releases the // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); - assert!(state.wal_backup_launcher_tx.is_none()); - state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories @@ -129,15 +122,12 @@ impl GlobalTimelines { /// this function is called during init when nothing else is running, so /// this is fine. async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { - let (conf, wal_backup_launcher_tx) = { + let (conf, broker_active_set) = { let state = TIMELINES_STATE.lock().unwrap(); - ( - state.get_conf().clone(), - state.wal_backup_launcher_tx.as_ref().unwrap().clone(), - ) + state.get_dependencies() }; - let timelines_dir = conf.tenant_dir(&tenant_id); + let timelines_dir = get_tenant_dir(&conf, &tenant_id); for timelines_dir_entry in std::fs::read_dir(&timelines_dir) .with_context(|| format!("failed to list timelines dir {}", timelines_dir))? 
{ @@ -147,7 +137,7 @@ impl GlobalTimelines { TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) { let ttid = TenantTimelineId::new(tenant_id, timeline_id); - match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) { + match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); TIMELINES_STATE @@ -155,8 +145,7 @@ impl GlobalTimelines { .unwrap() .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf); - tli.update_status_notify().await.unwrap(); + tli.bootstrap(&conf, broker_active_set.clone()); } // If we can't load a timeline, it's most likely because of a corrupted // directory. We will log an error and won't allow to delete/recreate @@ -189,9 +178,9 @@ impl GlobalTimelines { _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, ttid: TenantTimelineId, ) -> Result> { - let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies(); + let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies(); - match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) { + match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); @@ -202,7 +191,7 @@ impl GlobalTimelines { .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf); + tli.bootstrap(&conf, broker_active_set); Ok(tli) } @@ -221,6 +210,10 @@ impl GlobalTimelines { TIMELINES_STATE.lock().unwrap().get_conf().clone() } + pub fn get_global_broker_active_set() -> Arc { + TIMELINES_STATE.lock().unwrap().broker_active_set.clone() + } + /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. 
pub async fn create( @@ -229,7 +222,7 @@ impl GlobalTimelines { commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, wal_backup_launcher_tx) = { + let (conf, broker_active_set) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. @@ -243,7 +236,6 @@ impl GlobalTimelines { let timeline = Arc::new(Timeline::create_empty( &conf, ttid, - wal_backup_launcher_tx, server_info, commit_lsn, local_start_lsn, @@ -264,7 +256,10 @@ impl GlobalTimelines { // Write the new timeline to the disk and start background workers. // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. - if let Err(e) = timeline.init_new(&mut shared_state, &conf).await { + if let Err(e) = timeline + .init_new(&mut shared_state, &conf, broker_active_set) + .await + { // Note: the most likely reason for init failure is that the timeline // directory already exists on disk. This happens when timeline is corrupted // and wasn't loaded from disk on startup because of that. We want to preserve @@ -281,8 +276,6 @@ impl GlobalTimelines { // We are done with bootstrap, release the lock, return the timeline. // {} block forces release before .await } - timeline.update_status_notify().await?; - timeline.wal_backup_launcher_tx.send(timeline.ttid).await?; Ok(timeline) } @@ -327,16 +320,21 @@ impl GlobalTimelines { } /// Cancels timeline, then deletes the corresponding data directory. - pub async fn delete_force(ttid: &TenantTimelineId) -> Result { + /// If only_local, doesn't remove WAL segments in remote storage. + pub async fn delete( + ttid: &TenantTimelineId, + only_local: bool, + ) -> Result { let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); match tli_res { Ok(timeline) => { + let was_active = timeline.broker_active.load(Ordering::Relaxed); + // Take a lock and finish the deletion holding this mutex. 
let mut shared_state = timeline.write_shared_state().await; - info!("deleting timeline {}", ttid); - let (dir_existed, was_active) = - timeline.delete_from_disk(&mut shared_state).await?; + info!("deleting timeline {}, only_local={}", ttid, only_local); + let dir_existed = timeline.delete(&mut shared_state, only_local).await?; // Remove timeline from the map. // FIXME: re-enable it once we fix the issue with recreation of deleted timelines @@ -345,16 +343,12 @@ impl GlobalTimelines { Ok(TimelineDeleteForceResult { dir_existed, - was_active, + was_active, // TODO: we probably should remove this field }) } Err(_) => { // Timeline is not memory, but it may still exist on disk in broken state. - let dir_path = TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .timeline_dir(ttid); + let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid); let dir_existed = delete_dir(dir_path)?; Ok(TimelineDeleteForceResult { @@ -369,8 +363,11 @@ impl GlobalTimelines { /// the tenant had, `true` if a timeline was active. There may be a race if new timelines are /// created simultaneously. In that case the function will return error and the caller should /// retry tenant deletion again later. + /// + /// If only_local, doesn't remove WAL segments in remote storage. pub async fn delete_force_all_for_tenant( tenant_id: &TenantId, + only_local: bool, ) -> Result> { info!("deleting all timelines for tenant {}", tenant_id); let to_delete = Self::get_all_for_tenant(*tenant_id); @@ -379,7 +376,7 @@ impl GlobalTimelines { let mut deleted = HashMap::new(); for tli in &to_delete { - match Self::delete_force(&tli.ttid).await { + match Self::delete(&tli.ttid, only_local).await { Ok(result) => { deleted.insert(tli.ttid, result); } @@ -400,13 +397,10 @@ impl GlobalTimelines { // Note that we could concurrently create new timelines while we were deleting them, // so the directory may be not empty. 
In this case timelines will have bad state // and timeline background jobs can panic. - delete_dir( - TIMELINES_STATE - .lock() - .unwrap() - .get_conf() - .tenant_dir(tenant_id), - )?; + delete_dir(get_tenant_dir( + TIMELINES_STATE.lock().unwrap().get_conf(), + tenant_id, + ))?; // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); diff --git a/safekeeper/src/timelines_set.rs b/safekeeper/src/timelines_set.rs new file mode 100644 index 0000000000..ea8e23bb72 --- /dev/null +++ b/safekeeper/src/timelines_set.rs @@ -0,0 +1,90 @@ +use std::{collections::HashMap, sync::Arc}; + +use utils::id::TenantTimelineId; + +use crate::timeline::Timeline; + +/// Set of timelines, supports operations: +/// - add timeline +/// - remove timeline +/// - clone the set +/// +/// Usually used for keeping subset of timelines. For example active timelines that require broker push. +pub struct TimelinesSet { + timelines: std::sync::Mutex>>, +} + +impl Default for TimelinesSet { + fn default() -> Self { + Self { + timelines: std::sync::Mutex::new(HashMap::new()), + } + } +} + +impl TimelinesSet { + pub fn insert(&self, tli: Arc) { + self.timelines.lock().unwrap().insert(tli.ttid, tli); + } + + pub fn delete(&self, ttid: &TenantTimelineId) { + self.timelines.lock().unwrap().remove(ttid); + } + + /// If present is true, adds timeline to the set, otherwise removes it. + pub fn set_present(&self, tli: Arc, present: bool) { + if present { + self.insert(tli); + } else { + self.delete(&tli.ttid); + } + } + + pub fn is_present(&self, ttid: &TenantTimelineId) -> bool { + self.timelines.lock().unwrap().contains_key(ttid) + } + + /// Returns all timelines in the set. + pub fn get_all(&self) -> Vec> { + self.timelines.lock().unwrap().values().cloned().collect() + } + + /// Returns a timeline guard for easy presence control. 
+ pub fn guard(self: &Arc, tli: Arc) -> TimelineSetGuard { + let is_present = self.is_present(&tli.ttid); + TimelineSetGuard { + timelines_set: self.clone(), + tli, + is_present, + } + } +} + +/// Guard is used to add or remove timeline from the set. +/// If the timeline present in set, it will be removed from it on drop. +/// Note: do not use more than one guard for the same timeline, it caches the presence state. +/// It is designed to be used in the manager task only. +pub struct TimelineSetGuard { + timelines_set: Arc, + tli: Arc, + is_present: bool, +} + +impl TimelineSetGuard { + /// Returns true if the state was changed. + pub fn set(&mut self, present: bool) -> bool { + if present == self.is_present { + return false; + } + self.is_present = present; + self.timelines_set.set_present(self.tli.clone(), present); + true + } +} + +impl Drop for TimelineSetGuard { + fn drop(&mut self) { + // remove timeline from the map on drop + self.timelines_set.delete(&self.tli.ttid); + } +} diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index e4499eaf50..58591aecfa 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -4,10 +4,13 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::stream::FuturesOrdered; use futures::StreamExt; use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; +use utils::backoff; use utils::id::NodeId; use std::cmp::min; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; +use std::num::NonZeroU32; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; @@ -15,7 +18,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; @@ -26,9 +29,10 @@ use tracing::*; use 
utils::{id::TenantTimelineId, lsn::Lsn}; -use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS}; -use crate::timeline::{PeerInfo, Timeline}; -use crate::{GlobalTimelines, SafeKeeperConf}; +use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS}; +use crate::timeline::{FullAccessTimeline, PeerInfo, Timeline}; +use crate::timeline_manager::StateSnapshot; +use crate::{SafeKeeperConf, WAL_BACKUP_RUNTIME}; use once_cell::sync::OnceCell; @@ -38,35 +42,77 @@ const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000; /// Default buffer size when interfacing with [`tokio::fs::File`]. const BUFFER_SIZE: usize = 32 * 1024; -/// Check whether wal backup is required for timeline. If yes, mark that launcher is -/// aware of current status and return the timeline. -async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option> { - match GlobalTimelines::get(ttid).ok() { - Some(tli) => { - tli.wal_backup_attend().await; - Some(tli) - } - None => None, - } -} - -struct WalBackupTaskHandle { +pub struct WalBackupTaskHandle { shutdown_tx: Sender<()>, handle: JoinHandle<()>, } -struct WalBackupTimelineEntry { - timeline: Arc, - handle: Option, +/// Do we have anything to upload to S3, i.e. should safekeepers run backup activity? +pub fn is_wal_backup_required( + wal_seg_size: usize, + num_computes: usize, + state: &StateSnapshot, +) -> bool { + num_computes > 0 || + // Currently only the whole segment is offloaded, so compare segment numbers. + (state.commit_lsn.segment_number(wal_seg_size) > state.backup_lsn.segment_number(wal_seg_size)) } -async fn shut_down_task(ttid: TenantTimelineId, entry: &mut WalBackupTimelineEntry) { - if let Some(wb_handle) = entry.handle.take() { +/// Based on peer information determine which safekeeper should offload; if it +/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task +/// is running, kill it. 
+pub async fn update_task( + conf: &SafeKeeperConf, + tli: &Arc, + need_backup: bool, + state: &StateSnapshot, + entry: &mut Option, +) { + let (offloader, election_dbg_str) = + determine_offloader(&state.peers, state.backup_lsn, tli.ttid, conf); + let elected_me = Some(conf.my_id) == offloader; + + let should_task_run = need_backup && elected_me; + + // start or stop the task + if should_task_run != (entry.is_some()) { + if should_task_run { + info!("elected for backup: {}", election_dbg_str); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + + let async_task = backup_task_main(tli.clone(), conf.backup_parallel_jobs, shutdown_rx); + + let handle = if conf.current_thread_runtime { + tokio::spawn(async_task) + } else { + WAL_BACKUP_RUNTIME.spawn(async_task) + }; + + *entry = Some(WalBackupTaskHandle { + shutdown_tx, + handle, + }); + } else { + if !need_backup { + // don't need backup at all + info!("stepping down from backup, need_backup={}", need_backup); + } else { + // someone else has been elected + info!("stepping down from backup: {}", election_dbg_str); + } + shut_down_task(entry).await; + } + } +} + +async fn shut_down_task(entry: &mut Option) { + if let Some(wb_handle) = entry.take() { // Tell the task to shutdown. Error means task exited earlier, that's ok. let _ = wb_handle.shutdown_tx.send(()).await; // Await the task itself. TODO: restart panicked tasks earlier. if let Err(e) = wb_handle.handle.await { - warn!("WAL backup task for {} panicked: {}", ttid, e); + warn!("WAL backup task panicked: {}", e); } } } @@ -123,149 +169,54 @@ fn determine_offloader( } } -/// Based on peer information determine which safekeeper should offload; if it -/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task -/// is running, kill it. 
-async fn update_task( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - entry: &mut WalBackupTimelineEntry, -) { - let alive_peers = entry.timeline.get_peers(conf).await; - let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await; - let (offloader, election_dbg_str) = - determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf); - let elected_me = Some(conf.my_id) == offloader; +static REMOTE_STORAGE: OnceCell> = OnceCell::new(); - if elected_me != (entry.handle.is_some()) { - if elected_me { - info!("elected for backup: {}", election_dbg_str); - - let (shutdown_tx, shutdown_rx) = mpsc::channel(1); - let timeline_dir = conf.timeline_dir(&ttid); - - let handle = tokio::spawn( - backup_task_main( - ttid, - timeline_dir, - conf.workdir.clone(), - conf.backup_parallel_jobs, - shutdown_rx, - ) - .in_current_span(), - ); - - entry.handle = Some(WalBackupTaskHandle { - shutdown_tx, - handle, - }); - } else { - info!("stepping down from backup: {}", election_dbg_str); - shut_down_task(ttid, entry).await; - } - } +// Storage must be configured and initialized when this is called. +fn get_configured_remote_storage() -> &'static GenericRemoteStorage { + REMOTE_STORAGE + .get() + .expect("failed to get remote storage") + .as_ref() + .unwrap() } -const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; - -/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup -/// tasks. Having this in separate task simplifies locking, allows to reap -/// panics and separate elections from offloading itself. -pub async fn wal_backup_launcher_task_main( - conf: SafeKeeperConf, - mut wal_backup_launcher_rx: Receiver, -) -> anyhow::Result<()> { - info!( - "WAL backup launcher started, remote config {:?}", - conf.remote_storage - ); - - let conf_ = conf.clone(); +pub fn init_remote_storage(conf: &SafeKeeperConf) { + // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide + // dependencies to all tasks instead. 
REMOTE_STORAGE.get_or_init(|| { - conf_ - .remote_storage + conf.remote_storage .as_ref() .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) }); - - // Presence in this map means launcher is aware s3 offloading is needed for - // the timeline, but task is started only if it makes sense for to offload - // from this safekeeper. - let mut tasks: HashMap = HashMap::new(); - - let mut ticker = tokio::time::interval(Duration::from_millis(CHECK_TASKS_INTERVAL_MSEC)); - loop { - tokio::select! { - ttid = wal_backup_launcher_rx.recv() => { - // channel is never expected to get closed - let ttid = ttid.unwrap(); - if conf.remote_storage.is_none() || !conf.wal_backup_enabled { - continue; /* just drain the channel and do nothing */ - } - async { - let timeline = is_wal_backup_required(ttid).await; - // do we need to do anything at all? - if timeline.is_some() != tasks.contains_key(&ttid) { - if let Some(timeline) = timeline { - // need to start the task - let entry = tasks.entry(ttid).or_insert(WalBackupTimelineEntry { - timeline, - handle: None, - }); - update_task(&conf, ttid, entry).await; - } else { - // need to stop the task - info!("stopping WAL backup task"); - let mut entry = tasks.remove(&ttid).unwrap(); - shut_down_task(ttid, &mut entry).await; - } - } - }.instrument(info_span!("WAL backup", ttid = %ttid)).await; - } - // For each timeline needing offloading, check if this safekeeper - // should do the job and start/stop the task accordingly. - _ = ticker.tick() => { - for (ttid, entry) in tasks.iter_mut() { - update_task(&conf, *ttid, entry) - .instrument(info_span!("WAL backup", ttid = %ttid)) - .await; - } - } - } - } } struct WalBackupTask { - timeline: Arc, + timeline: FullAccessTimeline, timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, wal_seg_size: usize, parallel_jobs: usize, commit_lsn_watch_rx: watch::Receiver, } /// Offload single timeline. 
-async fn backup_task_main( - ttid: TenantTimelineId, - timeline_dir: Utf8PathBuf, - workspace_dir: Utf8PathBuf, - parallel_jobs: usize, - mut shutdown_rx: Receiver<()>, -) { +#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))] +async fn backup_task_main(tli: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>) { + let _guard = WAL_BACKUP_TASKS.guard(); + + let tli = match tli.full_access_guard().await { + Ok(tli) => tli, + Err(e) => { + error!("backup error: {}", e); + return; + } + }; info!("started"); - let res = GlobalTimelines::get(ttid); - if let Err(e) = res { - error!("backup error: {}", e); - return; - } - let tli = res.unwrap(); let mut wb = WalBackupTask { wal_seg_size: tli.get_wal_seg_size().await, commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(), + timeline_dir: tli.get_timeline_dir(), timeline: tli, - timeline_dir, - workspace_dir, parallel_jobs, }; @@ -332,7 +283,6 @@ impl WalBackupTask { commit_lsn, self.wal_seg_size, &self.timeline_dir, - &self.workspace_dir, self.parallel_jobs, ) .await @@ -354,18 +304,18 @@ impl WalBackupTask { } async fn backup_lsn_range( - timeline: &Arc, + timeline: &FullAccessTimeline, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, parallel_jobs: usize, ) -> Result<()> { if parallel_jobs < 1 { anyhow::bail!("parallel_jobs must be >= 1"); } + let remote_timeline_path = remote_timeline_path(&timeline.ttid)?; let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); @@ -378,7 +328,11 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment(s, timeline_dir, workspace_dir)); + uploads.push_back(backup_single_segment( + s, + timeline_dir, + &remote_timeline_path, + )); true } None => false, @@ -416,18 +370,10 @@ async fn backup_lsn_range( async fn backup_single_segment( seg: &Segment, timeline_dir: &Utf8Path, - workspace_dir: &Utf8Path, + 
remote_timeline_path: &RemotePath, ) -> Result { let segment_file_path = seg.file_path(timeline_dir)?; - let remote_segment_path = segment_file_path - .strip_prefix(workspace_dir) - .context("Failed to strip workspace dir prefix") - .and_then(RemotePath::new) - .with_context(|| { - format!( - "Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}", - ) - })?; + let remote_segment_path = seg.remote_path(remote_timeline_path); let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; if res.is_ok() { @@ -465,6 +411,10 @@ impl Segment { Ok(timeline_dir.join(self.object_name())) } + pub fn remote_path(self, remote_timeline_path: &RemotePath) -> RemotePath { + remote_timeline_path.join(self.object_name()) + } + pub fn size(self) -> usize { (u64::from(self.end_lsn) - u64::from(self.start_lsn)) as usize } @@ -484,18 +434,12 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { res } -static REMOTE_STORAGE: OnceCell> = OnceCell::new(); - async fn backup_object( source_file: &Utf8Path, target_file: &RemotePath, size: usize, ) -> Result<()> { - let storage = REMOTE_STORAGE - .get() - .expect("failed to get remote storage") - .as_ref() - .unwrap(); + let storage = get_configured_remote_storage(); let file = File::open(&source_file) .await @@ -503,7 +447,40 @@ async fn backup_object( let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE); - storage.upload_storage_object(file, size, target_file).await + let cancel = CancellationToken::new(); + + storage + .upload_storage_object(file, size, target_file, &cancel) + .await +} + +pub(crate) async fn backup_partial_segment( + source_file: &Utf8Path, + target_file: &RemotePath, + size: usize, +) -> Result<()> { + let storage = get_configured_remote_storage(); + + let file = File::open(&source_file) + .await + .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; + + // limiting the file to read only the first `size` 
bytes + let limited_file = tokio::io::AsyncReadExt::take(file, size as u64); + + let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE); + + let cancel = CancellationToken::new(); + + storage + .upload( + file, + size, + target_file, + Some(StorageMetadata::from([("sk_type", "partial_segment")])), + &cancel, + ) + .await } pub async fn read_object( @@ -518,8 +495,10 @@ pub async fn read_object( info!("segment download about to start from remote path {file_path:?} at offset {offset}"); + let cancel = CancellationToken::new(); + let download = storage - .download_storage_object(Some((offset, None)), file_path) + .download_storage_object(Some((offset, None)), file_path, &cancel) .await .with_context(|| { format!("Failed to open WAL segment download stream for remote path {file_path:?}") @@ -532,6 +511,76 @@ pub async fn read_object( Ok(Box::pin(reader)) } +/// Delete WAL files for the given timeline. Remote storage must be configured +/// when called. +pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { + let storage = get_configured_remote_storage(); + let remote_path = remote_timeline_path(ttid)?; + + // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE + // const Option unwrap is not stable, otherwise it would be const. + let batch_size: NonZeroU32 = NonZeroU32::new(1000).unwrap(); + + // A backoff::retry is used here for two reasons: + // - To provide a backoff rather than busy-polling the API on errors + // - To absorb transient 429/503 conditions without hitting our error + // logging path for issues deleting objects. + // + // Note: listing segments might take a long time if there are many of them. + // We don't currently have http requests timeout cancellation, but if/once + // we have listing should get streaming interface to make progress. + + let cancel = CancellationToken::new(); // not really used + backoff::retry( + || async { + // Do list-delete in batch_size batches to make progress even if there a lot of files. 
+ // Alternatively we could make remote storage list return iterator, but it is more complicated and + // I'm not sure deleting while iterating is expected in s3. + loop { + let files = storage + .list( + Some(&remote_path), + ListingMode::NoDelimiter, + Some(batch_size), + &cancel, + ) + .await? + .keys; + if files.is_empty() { + return Ok(()); // done + } + // (at least) s3 results are sorted, so can log min/max: + // "List results are always returned in UTF-8 binary order." + info!( + "deleting batch of {} WAL segments [{}-{}]", + files.len(), + files.first().unwrap().object_name().unwrap_or(""), + files.last().unwrap().object_name().unwrap_or("") + ); + storage.delete_objects(&files, &cancel).await?; + } + }, + // consider TimeoutOrCancel::caused_by_cancel when using cancellation + |_| false, + 3, + 10, + "executing WAL segments deletion batch", + &cancel, + ) + .await + .ok_or_else(|| anyhow::anyhow!("canceled")) + .and_then(|x| x)?; + + Ok(()) +} + +/// Used by wal_backup_partial. +pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { + let cancel = CancellationToken::new(); // not really used + let storage = get_configured_remote_storage(); + storage.delete_objects(paths, &cancel).await +} + /// Copy segments from one timeline to another. Used in copy_timeline. pub async fn copy_s3_segments( wal_seg_size: usize, @@ -548,12 +597,20 @@ pub async fn copy_s3_segments( .as_ref() .unwrap(); - let relative_dst_path = - Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string()); + let remote_dst_path = remote_timeline_path(dst_ttid)?; - let remote_path = RemotePath::new(&relative_dst_path)?; + let cancel = CancellationToken::new(); + + let files = storage + .list( + Some(&remote_dst_path), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? 
+ .keys; - let files = storage.list_files(Some(&remote_path)).await?; let uploaded_segments = &files .iter() .filter_map(|file| file.object_name().map(ToOwned::to_owned)) @@ -564,9 +621,6 @@ pub async fn copy_s3_segments( uploaded_segments ); - let relative_src_path = - Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string()); - for segno in from_segment..to_segment { if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { info!("copied all segments from {} until {}", from_segment, segno); @@ -578,10 +632,10 @@ pub async fn copy_s3_segments( } debug!("copying segment {}", segment_name); - let from = RemotePath::new(&relative_src_path.join(&segment_name))?; - let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; + let from = remote_timeline_path(src_ttid)?.join(&segment_name); + let to = remote_dst_path.join(&segment_name); - storage.copy_object(&from, &to).await?; + storage.copy_object(&from, &to, &cancel).await?; } info!( @@ -590,3 +644,8 @@ pub async fn copy_s3_segments( ); Ok(()) } + +/// Get S3 (remote_storage) prefix path used for timeline files. +pub fn remote_timeline_path(ttid: &TenantTimelineId) -> Result { + RemotePath::new(&Utf8Path::new(&ttid.tenant_id.to_string()).join(ttid.timeline_id.to_string())) +} diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs new file mode 100644 index 0000000000..ed5ddb71f5 --- /dev/null +++ b/safekeeper/src/wal_backup_partial.rs @@ -0,0 +1,406 @@ +//! Safekeeper timeline has a background task which is subscribed to `commit_lsn` +//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn` +//! was changed), the segment will be uploaded to S3 in about 15 minutes. +//! +//! The filename format for partial segments is +//! `Segment_Term_Flush_Commit_skNN.partial`, where: +//! - `Segment` – the segment name, like `000000010000000000000001` +//! - `Term` – current term +//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. 
`00000000346BC568` +//! - `Commit` – commit_lsn in the same hex format +//! - `NN` – safekeeper_id, like `1` +//! +//! The full object name example: +//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial` +//! +//! Each safekeeper will keep info about remote partial segments in its control +//! file. Code updates state in the control file before doing any S3 operations. +//! This way control file stores information about all potentially existing +//! remote partial segments and can clean them up after uploading a newer version. + +use camino::Utf8PathBuf; +use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use remote_storage::RemotePath; +use serde::{Deserialize, Serialize}; + +use tracing::{debug, error, info, instrument, warn}; +use utils::lsn::Lsn; + +use crate::{ + metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + safekeeper::Term, + timeline::FullAccessTimeline, + wal_backup::{self, remote_timeline_path}, + SafeKeeperConf, +}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum UploadStatus { + /// Upload is in progress + InProgress, + /// Upload is finished + Uploaded, + /// Deletion is in progress + Deleting, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PartialRemoteSegment { + pub status: UploadStatus, + pub name: String, + pub commit_lsn: Lsn, + pub flush_lsn: Lsn, + pub term: Term, +} + +impl PartialRemoteSegment { + fn eq_without_status(&self, other: &Self) -> bool { + self.name == other.name + && self.commit_lsn == other.commit_lsn + && self.flush_lsn == other.flush_lsn + && self.term == other.term + } +} + +// NB: these structures are a part of a control_file, you can't change them without +// changing the control file format version. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] +pub struct State { + pub segments: Vec, +} + +impl State { + /// Find an Uploaded segment. There should be only one Uploaded segment at a time. 
+ fn uploaded_segment(&self) -> Option { + self.segments + .iter() + .find(|seg| seg.status == UploadStatus::Uploaded) + .cloned() + } +} + +struct PartialBackup { + wal_seg_size: usize, + tli: FullAccessTimeline, + conf: SafeKeeperConf, + local_prefix: Utf8PathBuf, + remote_timeline_path: RemotePath, + + state: State, +} + +// Read-only methods for getting segment names +impl PartialBackup { + fn segno(&self, lsn: Lsn) -> XLogSegNo { + lsn.segment_number(self.wal_seg_size) + } + + fn segment_name(&self, segno: u64) -> String { + XLogFileName(PG_TLI, segno, self.wal_seg_size) + } + + fn remote_segment_name( + &self, + segno: u64, + term: u64, + commit_lsn: Lsn, + flush_lsn: Lsn, + ) -> String { + format!( + "{}_{}_{:016X}_{:016X}_sk{}.partial", + self.segment_name(segno), + term, + flush_lsn.0, + commit_lsn.0, + self.conf.my_id.0, + ) + } + + fn local_segment_name(&self, segno: u64) -> String { + format!("{}.partial", self.segment_name(segno)) + } +} + +impl PartialBackup { + /// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded. + async fn prepare_upload(&self) -> PartialRemoteSegment { + // this operation takes a lock to get the actual state + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + let flush_lsn = Lsn(sk_info.flush_lsn); + let commit_lsn = Lsn(sk_info.commit_lsn); + let term = sk_info.term; + let segno = self.segno(flush_lsn); + + let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn); + + PartialRemoteSegment { + status: UploadStatus::InProgress, + name, + commit_lsn, + flush_lsn, + term, + } + } + + /// Reads segment from disk and uploads it to the remote storage. + async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> { + let flush_lsn = prepared.flush_lsn; + let segno = self.segno(flush_lsn); + + // We're going to backup bytes from the start of the segment up to flush_lsn. 
+ let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); + + let local_path = self.local_prefix.join(self.local_segment_name(segno)); + let remote_path = self.remote_timeline_path.join(&prepared.name); + + // Upload first `backup_bytes` bytes of the segment to the remote storage. + wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); + + // We uploaded the segment, now let's verify that the data is still actual. + // If the term changed, we cannot guarantee the validity of the uploaded data. + // If the term is the same, we know the data is not corrupted. + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + if sk_info.term != prepared.term { + anyhow::bail!("term changed during upload"); + } + assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn)); + assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn)); + + Ok(()) + } + + /// Write new state to disk. If in-memory and on-disk states diverged, returns an error. + async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> { + self.tli + .map_control_file(|cf| { + if cf.partial_backup != self.state { + let memory = self.state.clone(); + self.state = cf.partial_backup.clone(); + anyhow::bail!( + "partial backup state diverged, memory={:?}, disk={:?}", + memory, + cf.partial_backup + ); + } + + cf.partial_backup = new_state.clone(); + Ok(()) + }) + .await?; + // update in-memory state + self.state = new_state; + Ok(()) + } + + /// Upload the latest version of the partial segment and garbage collect older versions. 
+ #[instrument(name = "upload", skip_all, fields(name = %prepared.name))] + async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> { + info!("starting upload {:?}", prepared); + + let state_0 = self.state.clone(); + let state_1 = { + let mut state = state_0.clone(); + state.segments.push(prepared.clone()); + state + }; + + // we're going to upload a new segment, let's write it to disk to make GC later + self.commit_state(state_1).await?; + + self.upload_segment(prepared.clone()).await?; + + let state_2 = { + let mut state = state_0.clone(); + for seg in state.segments.iter_mut() { + seg.status = UploadStatus::Deleting; + } + let mut actual_remote_segment = prepared.clone(); + actual_remote_segment.status = UploadStatus::Uploaded; + state.segments.push(actual_remote_segment); + state + }; + + // we've uploaded new segment, it's actual, all other segments should be GCed + self.commit_state(state_2).await?; + self.gc().await?; + + Ok(()) + } + + /// Delete all non-Uploaded segments from the remote storage. There should be only one + /// Uploaded segment at a time. 
+ #[instrument(name = "gc", skip_all)] + async fn gc(&mut self) -> anyhow::Result<()> { + let mut segments_to_delete = vec![]; + + let new_segments: Vec = self + .state + .segments + .iter() + .filter_map(|seg| { + if seg.status == UploadStatus::Uploaded { + Some(seg.clone()) + } else { + segments_to_delete.push(seg.name.clone()); + None + } + }) + .collect(); + + info!("deleting objects: {:?}", segments_to_delete); + let mut objects_to_delete = vec![]; + for seg in segments_to_delete.iter() { + let remote_path = self.remote_timeline_path.join(seg); + objects_to_delete.push(remote_path); + } + + // removing segments from remote storage + wal_backup::delete_objects(&objects_to_delete).await?; + + // now we can update the state on disk + let new_state = { + let mut state = self.state.clone(); + state.segments = new_segments; + state + }; + self.commit_state(new_state).await?; + + Ok(()) + } +} + +#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] +pub async fn main_task(tli: FullAccessTimeline, conf: SafeKeeperConf) { + debug!("started"); + let await_duration = conf.partial_backup_timeout; + + let (_, persistent_state) = tli.get_state().await; + let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); + let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); + let wal_seg_size = tli.get_wal_seg_size().await; + + let local_prefix = tli.get_timeline_dir(); + let remote_timeline_path = match remote_timeline_path(&tli.ttid) { + Ok(path) => path, + Err(e) => { + error!("failed to create remote path: {:?}", e); + return; + } + }; + + let mut backup = PartialBackup { + wal_seg_size, + tli, + state: persistent_state.partial_backup, + conf, + local_prefix, + remote_timeline_path, + }; + + debug!("state: {:?}", backup.state); + + // The general idea is that each safekeeper keeps only one partial segment + // both in remote storage and in local state. If this is not true, something + // went wrong. 
+ const MAX_SIMULTANEOUS_SEGMENTS: usize = 10; + + 'outer: loop { + if backup.state.segments.len() > MAX_SIMULTANEOUS_SEGMENTS { + warn!( + "too many segments in control_file state, running gc: {}", + backup.state.segments.len() + ); + + backup.gc().await.unwrap_or_else(|e| { + error!("failed to run gc: {:#}", e); + }); + } + + // wait until we have something to upload + let uploaded_segment = backup.state.uploaded_segment(); + if let Some(seg) = &uploaded_segment { + // if we already uploaded something, wait until we have something new + while flush_lsn_rx.borrow().lsn == seg.flush_lsn + && *commit_lsn_rx.borrow() == seg.commit_lsn + && flush_lsn_rx.borrow().term == seg.term + { + tokio::select! { + _ = backup.tli.cancel.cancelled() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => {} + } + } + } + + // if we don't have any data and zero LSNs, wait for something + while flush_lsn_rx.borrow().lsn == Lsn(0) { + tokio::select! { + _ = backup.tli.cancel.cancelled() => { + info!("timeline canceled"); + return; + } + _ = flush_lsn_rx.changed() => {} + } + } + + // fixing the segno and waiting some time to prevent reuploading the same segment too often + let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); + let timeout = tokio::time::sleep(await_duration); + tokio::pin!(timeout); + let mut timeout_expired = false; + + // waiting until timeout expires OR segno changes + 'inner: loop { + tokio::select! 
{ + _ = backup.tli.cancel.cancelled() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => { + let segno = backup.segno(flush_lsn_rx.borrow().lsn); + if segno != pending_segno { + // previous segment is no longer partial, aborting the wait + break 'inner; + } + } + _ = &mut timeout => { + // timeout expired, now we are ready for upload + timeout_expired = true; + break 'inner; + } + } + } + + if !timeout_expired { + // likely segno has changed, let's try again in the next iteration + continue 'outer; + } + + let prepared = backup.prepare_upload().await; + if let Some(seg) = &uploaded_segment { + if seg.eq_without_status(&prepared) { + // we already uploaded this segment, nothing to do + continue 'outer; + } + } + + match backup.do_upload(&prepared).await { + Ok(()) => { + debug!( + "uploaded {} up to flush_lsn {}", + prepared.name, prepared.flush_lsn + ); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc(); + } + Err(e) => { + info!("failed to upload {}: {:#}", prepared.name, e); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc(); + } + } + } +} diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index bceaad1e16..4a97eb3993 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -68,7 +68,7 @@ async fn handle_socket( // is not Unpin, and all pgbackend/framed/tokio dependencies require stream // to be Unpin. Which is reasonable, as indeed something like TimeoutReader // shouldn't be moved. 
- tokio::pin!(socket); + let socket = std::pin::pin!(socket); let traffic_metrics = TrafficMetrics::new(); if let Some(current_az) = conf.availability_zone.as_deref() { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 8d138c701f..0c1731937c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -21,10 +21,11 @@ use tokio::fs::{self, remove_file, File, OpenOptions}; use tokio::io::{AsyncRead, AsyncWriteExt}; use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::*; +use utils::crashsafe::durable_rename; use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; -use crate::safekeeper::SafeKeeperState; -use crate::wal_backup::read_object; +use crate::state::TimelinePersistentState; +use crate::wal_backup::{read_object, remote_timeline_path}; use crate::SafeKeeperConf; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::XLogFileName; @@ -37,6 +38,12 @@ pub trait Storage { /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; + /// Initialize segment by creating proper long header at the beginning of + /// the segment and short header at the page of given LSN. This is only used + /// for timeline initialization because compute will stream data only since + /// init_lsn. Other segment headers are included in compute stream. + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>; + /// Write piece of WAL from buf to disk, but not necessarily sync it. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; @@ -77,6 +84,8 @@ pub struct PhysicalStorage { /// Size of WAL segment in bytes. wal_seg_size: usize, + pg_version: u32, + system_id: u64, /// Written to disk, but possibly still in the cache and not fully persisted. /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. 
@@ -125,7 +134,7 @@ impl PhysicalStorage { ttid: &TenantTimelineId, timeline_dir: Utf8PathBuf, conf: &SafeKeeperConf, - state: &SafeKeeperState, + state: &TimelinePersistentState, ) -> Result { let wal_seg_size = state.server.wal_seg_size as usize; @@ -168,6 +177,8 @@ impl PhysicalStorage { timeline_dir, conf: conf.clone(), wal_seg_size, + pg_version: state.server.pg_version, + system_id: state.server.system_id, write_lsn, write_record_lsn: write_lsn, flush_record_lsn: flush_lsn, @@ -196,15 +207,6 @@ impl PhysicalStorage { Ok(()) } - /// Call fsync if config requires so. - async fn fsync_file(&mut self, file: &File) -> Result<()> { - if !self.conf.no_sync { - self.metrics - .observe_flush_seconds(time_io_closure(file.sync_all()).await?); - } - Ok(()) - } - /// Open or create WAL segment file. Caller must call seek to the wanted position. /// Returns `file` and `is_partial`. async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> { @@ -223,15 +225,34 @@ impl PhysicalStorage { Ok((file, true)) } else { // Create and fill new partial file + // + // We're using fdatasync during WAL writing, so file size must not + // change; to this end it is filled with zeros here. To avoid using + // half initialized segment, first bake it under tmp filename and + // then rename. + let tmp_path = self.timeline_dir.join("waltmp"); + #[allow(clippy::suspicious_open_options)] let mut file = OpenOptions::new() .create(true) .write(true) - .open(&wal_file_partial_path) + .open(&tmp_path) .await - .with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?; + .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?; write_zeroes(&mut file, self.wal_seg_size).await?; - self.fsync_file(&file).await?; + + // Note: this doesn't get into observe_flush_seconds metric. But + // segment init should be separate metric, if any. 
+ if let Err(e) = + durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await + { + // Probably rename succeeded, but fsync of it failed. Remove + // the file then to avoid using it. + remove_file(wal_file_partial_path) + .await + .or_else(utils::fs_ext::ignore_not_found)?; + return Err(e.into()); + } Ok((file, true)) } } @@ -313,6 +334,20 @@ impl Storage for PhysicalStorage { self.flush_record_lsn } + async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let segno = init_lsn.segment_number(self.wal_seg_size); + let (mut file, _) = self.open_or_create(segno).await?; + let major_pg_version = self.pg_version / 10000; + let wal_seg = + postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?; + file.seek(SeekFrom::Start(0)).await?; + file.write_all(&wal_seg).await?; + file.flush().await?; + info!("initialized segno {} at lsn {}", segno, init_lsn); + // note: file is *not* fsynced + Ok(()) + } + /// Write WAL to disk. async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { // Disallow any non-sequential writes, which can result in gaps or overwrites. 
@@ -501,7 +536,7 @@ async fn remove_segments_from_disk( } pub struct WalReader { - workdir: Utf8PathBuf, + remote_path: RemotePath, timeline_dir: Utf8PathBuf, wal_seg_size: usize, pos: Lsn, @@ -523,9 +558,9 @@ pub struct WalReader { impl WalReader { pub fn new( - workdir: Utf8PathBuf, + ttid: &TenantTimelineId, timeline_dir: Utf8PathBuf, - state: &SafeKeeperState, + state: &TimelinePersistentState, start_pos: Lsn, enable_remote_read: bool, ) -> Result { @@ -551,7 +586,7 @@ impl WalReader { } Ok(Self { - workdir, + remote_path: remote_timeline_path(ttid)?, timeline_dir, wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, @@ -649,13 +684,12 @@ impl WalReader { let xlogoff = self.pos.segment_offset(self.wal_seg_size); let segno = self.pos.segment_number(self.wal_seg_size); let wal_file_name = XLogFileName(PG_TLI, segno, self.wal_seg_size); - let wal_file_path = self.timeline_dir.join(wal_file_name); // Try to open local file, if we may have WAL locally if self.pos >= self.local_start_lsn { - let res = Self::open_wal_file(&wal_file_path).await; + let res = open_wal_file(&self.timeline_dir, segno, self.wal_seg_size).await; match res { - Ok(mut file) => { + Ok((mut file, _)) => { file.seek(SeekFrom::Start(xlogoff as u64)).await?; return Ok(Box::pin(file)); } @@ -677,40 +711,12 @@ impl WalReader { // Try to open remote file, if remote reads are enabled if self.enable_remote_read { - let remote_wal_file_path = wal_file_path - .strip_prefix(&self.workdir) - .context("Failed to strip workdir prefix") - .and_then(RemotePath::new) - .with_context(|| { - format!( - "Failed to resolve remote part of path {:?} for base {:?}", - wal_file_path, self.workdir, - ) - })?; + let remote_wal_file_path = self.remote_path.join(&wal_file_name); return read_object(&remote_wal_file_path, xlogoff as u64).await; } bail!("WAL segment is not found") } - - /// Helper function for opening a wal file. 
- async fn open_wal_file(wal_file_path: &Utf8Path) -> Result { - // First try to open the .partial file. - let mut partial_path = wal_file_path.to_owned(); - partial_path.set_extension("partial"); - if let Ok(opened_file) = tokio::fs::File::open(&partial_path).await { - return Ok(opened_file); - } - - // If that failed, try it without the .partial extension. - tokio::fs::File::open(&wal_file_path) - .await - .with_context(|| format!("Failed to open WAL file {:?}", wal_file_path)) - .map_err(|e| { - warn!("{}", e); - e - }) - } } /// Zero block for filling created WAL segments. @@ -718,6 +724,11 @@ const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; /// Helper for filling file with zeroes. async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { + fail::fail_point!("sk-write-zeroes", |_| { + info!("write_zeroes hit failpoint"); + Err(anyhow::anyhow!("failpoint: sk-write-zeroes")) + }); + while count >= XLOG_BLCKSZ { file.write_all(ZERO_BLOCK).await?; count -= XLOG_BLCKSZ; @@ -727,6 +738,34 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { Ok(()) } +/// Helper function for opening WAL segment `segno` in `dir`. Returns file and +/// whether it is .partial. +pub(crate) async fn open_wal_file( + timeline_dir: &Utf8Path, + segno: XLogSegNo, + wal_seg_size: usize, +) -> Result<(tokio::fs::File, bool)> { + let (wal_file_path, wal_file_partial_path) = wal_file_paths(timeline_dir, segno, wal_seg_size)?; + + // First try to open the .partial file. + let mut partial_path = wal_file_path.to_owned(); + partial_path.set_extension("partial"); + if let Ok(opened_file) = tokio::fs::File::open(&wal_file_partial_path).await { + return Ok((opened_file, true)); + } + + // If that failed, try it without the .partial extension. 
+ let pf = tokio::fs::File::open(&wal_file_path) + .await + .with_context(|| format!("failed to open WAL file {:#}", wal_file_path)) + .map_err(|e| { + warn!("{}", e); + e + })?; + + Ok((pf, false)) +} + /// Helper returning full path to WAL segment file and its .partial brother. pub fn wal_file_paths( timeline_dir: &Utf8Path, diff --git a/safekeeper/tests/misc_test.rs b/safekeeper/tests/misc_test.rs new file mode 100644 index 0000000000..8e5b17a143 --- /dev/null +++ b/safekeeper/tests/misc_test.rs @@ -0,0 +1,155 @@ +use std::sync::Arc; + +use tracing::{info, warn}; +use utils::lsn::Lsn; + +use crate::walproposer_sim::{ + log::{init_logger, init_tracing_logger}, + simulation::{generate_network_opts, generate_schedule, Schedule, TestAction, TestConfig}, +}; + +pub mod walproposer_sim; + +// Test that simulation supports restarting (crashing) safekeepers. +#[test] +fn crash_safekeeper() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let mut wp = test.launch_walproposer(lsn); + + // Write some WAL and crash safekeeper 0 without waiting for replication. + test.poll_for_duration(30); + wp.write_tx(3); + test.servers[0].restart(); + + // Wait some time, so that walproposer can reconnect. + test.poll_for_duration(2000); +} + +// Test that walproposer can be crashed (stopped). 
+#[test] +fn test_simple_restart() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let mut wp = test.launch_walproposer(lsn); + + test.poll_for_duration(30); + wp.write_tx(3); + test.poll_for_duration(100); + + wp.stop(); + drop(wp); + + let lsn = test.sync_safekeepers().unwrap(); + info!("Sucessfully synced safekeepers at {}", lsn); +} + +// Test runnning a simple schedule, restarting everything a several times. +#[test] +fn test_simple_schedule() -> anyhow::Result<()> { + let clock = init_logger(); + let mut config = TestConfig::new(Some(clock)); + config.network.keepalive_timeout = Some(100); + let test = config.start(1337); + + let schedule: Schedule = vec![ + (0, TestAction::RestartWalProposer), + (50, TestAction::WriteTx(5)), + (100, TestAction::RestartSafekeeper(0)), + (100, TestAction::WriteTx(5)), + (110, TestAction::RestartSafekeeper(1)), + (110, TestAction::WriteTx(5)), + (120, TestAction::RestartSafekeeper(2)), + (120, TestAction::WriteTx(5)), + (201, TestAction::RestartWalProposer), + (251, TestAction::RestartSafekeeper(0)), + (251, TestAction::RestartSafekeeper(1)), + (251, TestAction::RestartSafekeeper(2)), + (251, TestAction::WriteTx(5)), + (255, TestAction::WriteTx(5)), + (1000, TestAction::WriteTx(5)), + ]; + + test.run_schedule(&schedule)?; + info!("Test finished, stopping all threads"); + test.world.deallocate(); + + Ok(()) +} + +// Test that simulation can process 10^4 transactions. 
+#[test] +fn test_many_tx() -> anyhow::Result<()> { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let mut schedule: Schedule = vec![]; + for i in 0..100 { + schedule.push((i * 10, TestAction::WriteTx(100))); + } + + test.run_schedule(&schedule)?; + info!("Test finished, stopping all threads"); + test.world.stop_all(); + + let events = test.world.take_events(); + info!("Events: {:?}", events); + let last_commit_lsn = events + .iter() + .filter_map(|event| { + if event.data.starts_with("commit_lsn;") { + let lsn: u64 = event.data.split(';').nth(1).unwrap().parse().unwrap(); + return Some(lsn); + } + None + }) + .last() + .unwrap(); + + let initdb_lsn = 21623024; + let diff = last_commit_lsn - initdb_lsn; + info!("Last commit lsn: {}, diff: {}", last_commit_lsn, diff); + // each tx is at least 8 bytes, it's written a 100 times for in a loop for 100 times + assert!(diff > 100 * 100 * 8); + Ok(()) +} + +// Checks that we don't have nasty circular dependencies, preventing Arc from deallocating. +// This test doesn't really assert anything, you need to run it manually to check if there +// is any issue. 
+#[test] +fn test_res_dealloc() -> anyhow::Result<()> { + let clock = init_tracing_logger(true); + let mut config = TestConfig::new(Some(clock)); + + let seed = 123456; + config.network = generate_network_opts(seed); + let test = config.start(seed); + warn!("Running test with seed {}", seed); + + let schedule = generate_schedule(seed); + info!("schedule: {:?}", schedule); + test.run_schedule(&schedule).unwrap(); + test.world.stop_all(); + + let world = test.world.clone(); + drop(test); + info!("world strong count: {}", Arc::strong_count(&world)); + world.deallocate(); + info!("world strong count: {}", Arc::strong_count(&world)); + + Ok(()) +} diff --git a/safekeeper/tests/random_test.rs b/safekeeper/tests/random_test.rs new file mode 100644 index 0000000000..6c6f6a8c96 --- /dev/null +++ b/safekeeper/tests/random_test.rs @@ -0,0 +1,56 @@ +use rand::Rng; +use tracing::{info, warn}; + +use crate::walproposer_sim::{ + log::{init_logger, init_tracing_logger}, + simulation::{generate_network_opts, generate_schedule, TestConfig}, + simulation_logs::validate_events, +}; + +pub mod walproposer_sim; + +// Generates 2000 random seeds and runs a schedule for each of them. +// If you seed this test fail, please report the last seed to the +// @safekeeper team. +#[test] +fn test_random_schedules() -> anyhow::Result<()> { + let clock = init_logger(); + let mut config = TestConfig::new(Some(clock)); + + for _ in 0..2000 { + let seed: u64 = rand::thread_rng().gen(); + config.network = generate_network_opts(seed); + + let test = config.start(seed); + warn!("Running test with seed {}", seed); + + let schedule = generate_schedule(seed); + test.run_schedule(&schedule).unwrap(); + validate_events(test.world.take_events()); + test.world.deallocate(); + } + + Ok(()) +} + +// After you found a seed that fails, you can insert this seed here +// and run the test to see the full debug output. 
+#[test] +fn test_one_schedule() -> anyhow::Result<()> { + let clock = init_tracing_logger(true); + let mut config = TestConfig::new(Some(clock)); + + let seed = 11047466935058776390; + config.network = generate_network_opts(seed); + info!("network: {:?}", config.network); + let test = config.start(seed); + warn!("Running test with seed {}", seed); + + let schedule = generate_schedule(seed); + info!("schedule: {:?}", schedule); + test.run_schedule(&schedule).unwrap(); + validate_events(test.world.take_events()); + test.world.deallocate(); + + Ok(()) +} diff --git a/safekeeper/tests/simple_test.rs b/safekeeper/tests/simple_test.rs new file mode 100644 index 0000000000..0be9d0deef --- /dev/null +++ b/safekeeper/tests/simple_test.rs @@ -0,0 +1,45 @@ +use tracing::info; +use utils::lsn::Lsn; + +use crate::walproposer_sim::{log::init_logger, simulation::TestConfig}; + +pub mod walproposer_sim; + +// Check that first start of sync_safekeepers() returns 0/0 on empty safekeepers. +#[test] +fn sync_empty_safekeepers() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced (again) empty safekeepers at 0/0"); +} + +// Check that there are no panics when we are writing and streaming WAL to safekeepers. 
+#[test] +fn run_walproposer_generate_wal() { + let clock = init_logger(); + let config = TestConfig::new(Some(clock)); + let test = config.start(1337); + + let lsn = test.sync_safekeepers().unwrap(); + assert_eq!(lsn, Lsn(0)); + info!("Sucessfully synced empty safekeepers at 0/0"); + + let mut wp = test.launch_walproposer(lsn); + + // wait for walproposer to start + test.poll_for_duration(30); + + // just write some WAL + for _ in 0..100 { + wp.write_tx(1); + test.poll_for_duration(5); + } +} diff --git a/safekeeper/tests/walproposer_sim/block_storage.rs b/safekeeper/tests/walproposer_sim/block_storage.rs new file mode 100644 index 0000000000..468c02ad2f --- /dev/null +++ b/safekeeper/tests/walproposer_sim/block_storage.rs @@ -0,0 +1,57 @@ +use std::collections::HashMap; + +const BLOCK_SIZE: usize = 8192; + +/// A simple in-memory implementation of a block storage. Can be used to implement external +/// storage in tests. +pub struct BlockStorage { + blocks: HashMap, +} + +impl Default for BlockStorage { + fn default() -> Self { + Self::new() + } +} + +impl BlockStorage { + pub fn new() -> Self { + BlockStorage { + blocks: HashMap::new(), + } + } + + pub fn read(&self, pos: u64, buf: &mut [u8]) { + let mut buf_offset = 0; + let mut storage_pos = pos; + while buf_offset < buf.len() { + let block_id = storage_pos / BLOCK_SIZE as u64; + let block = self.blocks.get(&block_id).unwrap_or(&[0; BLOCK_SIZE]); + let block_offset = storage_pos % BLOCK_SIZE as u64; + let block_len = BLOCK_SIZE as u64 - block_offset; + let buf_len = buf.len() - buf_offset; + let copy_len = std::cmp::min(block_len as usize, buf_len); + buf[buf_offset..buf_offset + copy_len] + .copy_from_slice(&block[block_offset as usize..block_offset as usize + copy_len]); + buf_offset += copy_len; + storage_pos += copy_len as u64; + } + } + + pub fn write(&mut self, pos: u64, buf: &[u8]) { + let mut buf_offset = 0; + let mut storage_pos = pos; + while buf_offset < buf.len() { + let block_id = storage_pos / 
BLOCK_SIZE as u64; + let block = self.blocks.entry(block_id).or_insert([0; BLOCK_SIZE]); + let block_offset = storage_pos % BLOCK_SIZE as u64; + let block_len = BLOCK_SIZE as u64 - block_offset; + let buf_len = buf.len() - buf_offset; + let copy_len = std::cmp::min(block_len as usize, buf_len); + block[block_offset as usize..block_offset as usize + copy_len] + .copy_from_slice(&buf[buf_offset..buf_offset + copy_len]); + buf_offset += copy_len; + storage_pos += copy_len as u64 + } + } +} diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs new file mode 100644 index 0000000000..870f30de4f --- /dev/null +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -0,0 +1,77 @@ +use std::{fmt, sync::Arc}; + +use desim::time::Timing; +use once_cell::sync::OnceCell; +use parking_lot::Mutex; +use tracing_subscriber::fmt::{format::Writer, time::FormatTime}; + +/// SimClock can be plugged into tracing logger to print simulation time. +#[derive(Clone)] +pub struct SimClock { + clock_ptr: Arc>>>, +} + +impl Default for SimClock { + fn default() -> Self { + SimClock { + clock_ptr: Arc::new(Mutex::new(None)), + } + } +} + +impl SimClock { + pub fn set_clock(&self, clock: Arc) { + *self.clock_ptr.lock() = Some(clock); + } +} + +impl FormatTime for SimClock { + fn format_time(&self, w: &mut Writer<'_>) -> fmt::Result { + let clock = self.clock_ptr.lock(); + + if let Some(clock) = clock.as_ref() { + let now = clock.now(); + write!(w, "[{}]", now) + } else { + write!(w, "[?]") + } + } +} + +static LOGGING_DONE: OnceCell = OnceCell::new(); + +/// Returns ptr to clocks attached to tracing logger to update them when the +/// world is (re)created. 
+pub fn init_tracing_logger(debug_enabled: bool) -> SimClock { + LOGGING_DONE + .get_or_init(|| { + let clock = SimClock::default(); + let base_logger = tracing_subscriber::fmt() + .with_target(false) + // prefix log lines with simulated time timestamp + .with_timer(clock.clone()) + // .with_ansi(true) TODO + .with_max_level(match debug_enabled { + true => tracing::Level::DEBUG, + false => tracing::Level::WARN, + }) + .with_writer(std::io::stdout); + base_logger.init(); + + // logging::replace_panic_hook_with_tracing_panic_hook().forget(); + + if !debug_enabled { + std::panic::set_hook(Box::new(|_| {})); + } + + clock + }) + .clone() +} + +pub fn init_logger() -> SimClock { + // RUST_TRACEBACK envvar controls whether we print all logs or only warnings. + let debug_enabled = std::env::var("RUST_TRACEBACK").is_ok(); + + init_tracing_logger(debug_enabled) +} diff --git a/safekeeper/tests/walproposer_sim/mod.rs b/safekeeper/tests/walproposer_sim/mod.rs new file mode 100644 index 0000000000..ec560dcb3b --- /dev/null +++ b/safekeeper/tests/walproposer_sim/mod.rs @@ -0,0 +1,8 @@ +pub mod block_storage; +pub mod log; +pub mod safekeeper; +pub mod safekeeper_disk; +pub mod simulation; +pub mod simulation_logs; +pub mod walproposer_api; +pub mod walproposer_disk; diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs new file mode 100644 index 0000000000..47539872a6 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -0,0 +1,415 @@ +//! Safekeeper communication endpoint to WAL proposer (compute node). +//! Gets messages from the network, passes them down to consensus module and +//! sends replies back. 
+ +use std::{collections::HashMap, sync::Arc, time::Duration}; + +use anyhow::{bail, Result}; +use bytes::{Bytes, BytesMut}; +use camino::Utf8PathBuf; +use desim::{ + executor::{self, PollSome}, + network::TCP, + node_os::NodeOs, + proto::{AnyMessage, NetEvent, NodeEvent}, +}; +use hyper::Uri; +use safekeeper::{ + safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, + state::TimelinePersistentState, + timeline::TimelineError, + wal_storage::Storage, + SafeKeeperConf, +}; +use tracing::{debug, info_span}; +use utils::{ + id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use super::safekeeper_disk::{DiskStateStorage, DiskWALStorage, SafekeeperDisk, TimelineDisk}; + +struct SharedState { + sk: SafeKeeper, + disk: Arc, +} + +struct GlobalMap { + timelines: HashMap, + conf: SafeKeeperConf, + disk: Arc, +} + +impl GlobalMap { + /// Restores global state from disk. + fn new(disk: Arc, conf: SafeKeeperConf) -> Result { + let mut timelines = HashMap::new(); + + for (&ttid, disk) in disk.timelines.lock().iter() { + debug!("loading timeline {}", ttid); + let state = disk.state.lock().clone(); + + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(ttid)); + } + + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(ttid)); + } + + if state.commit_lsn < state.local_start_lsn { + bail!( + "commit_lsn {} is higher than local_start_lsn {}", + state.commit_lsn, + state.local_start_lsn + ); + } + + let control_store = DiskStateStorage::new(disk.clone()); + let wal_store = DiskWALStorage::new(disk.clone(), &control_store)?; + + let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; + timelines.insert( + ttid, + SharedState { + sk, + disk: disk.clone(), + }, + ); + } + + Ok(Self { + timelines, + conf, + disk, + }) + } + + fn create(&mut self, ttid: TenantTimelineId, server_info: ServerInfo) -> Result<()> { + if 
self.timelines.contains_key(&ttid) { + bail!("timeline {} already exists", ttid); + } + + debug!("creating new timeline {}", ttid); + + let commit_lsn = Lsn::INVALID; + let local_start_lsn = Lsn::INVALID; + + let state = + TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); + + if state.server.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(ttid)); + } + + if state.server.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(ttid)); + } + + if state.commit_lsn < state.local_start_lsn { + bail!( + "commit_lsn {} is higher than local_start_lsn {}", + state.commit_lsn, + state.local_start_lsn + ); + } + + let disk_timeline = self.disk.put_state(&ttid, state); + let control_store = DiskStateStorage::new(disk_timeline.clone()); + let wal_store = DiskWALStorage::new(disk_timeline.clone(), &control_store)?; + + let sk = SafeKeeper::new(control_store, wal_store, self.conf.my_id)?; + + self.timelines.insert( + ttid, + SharedState { + sk, + disk: disk_timeline, + }, + ); + Ok(()) + } + + fn get(&mut self, ttid: &TenantTimelineId) -> &mut SharedState { + self.timelines.get_mut(ttid).expect("timeline must exist") + } + + fn has_tli(&self, ttid: &TenantTimelineId) -> bool { + self.timelines.contains_key(ttid) + } +} + +/// State of a single connection to walproposer. 
+struct ConnState { + tcp: TCP, + + greeting: bool, + ttid: TenantTimelineId, + flush_pending: bool, + + runtime: tokio::runtime::Runtime, +} + +pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { + let _enter = info_span!("safekeeper", id = os.id()).entered(); + debug!("started server"); + os.log_event("started;safekeeper".to_owned()); + let conf = SafeKeeperConf { + workdir: Utf8PathBuf::from("."), + my_id: NodeId(os.id() as u64), + listen_pg_addr: String::new(), + listen_http_addr: String::new(), + no_sync: false, + broker_endpoint: "/".parse::().unwrap(), + broker_keepalive_interval: Duration::from_secs(0), + heartbeat_timeout: Duration::from_secs(0), + remote_storage: None, + max_offloader_lag_bytes: 0, + wal_backup_enabled: false, + listen_pg_addr_tenant_only: None, + advertise_pg_addr: None, + availability_zone: None, + peer_recovery_enabled: false, + backup_parallel_jobs: 0, + pg_auth: None, + pg_tenant_only_auth: None, + http_auth: None, + sk_auth_token: None, + current_thread_runtime: false, + walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), + disable_periodic_broker_push: false, + }; + + let mut global = GlobalMap::new(disk, conf.clone())?; + let mut conns: HashMap = HashMap::new(); + + for (&_ttid, shared_state) in global.timelines.iter_mut() { + let flush_lsn = shared_state.sk.wal_store.flush_lsn(); + let commit_lsn = shared_state.sk.state.commit_lsn; + os.log_event(format!("tli_loaded;{};{}", flush_lsn.0, commit_lsn.0)); + } + + let node_events = os.node_events(); + let mut epoll_vec: Vec> = vec![]; + let mut epoll_idx: Vec = vec![]; + + // TODO: batch events processing (multiple events per tick) + loop { + epoll_vec.clear(); + epoll_idx.clear(); + + // node events channel + epoll_vec.push(Box::new(node_events.clone())); + epoll_idx.push(0); + + // tcp connections + for conn in conns.values() { + epoll_vec.push(Box::new(conn.tcp.recv_chan())); + 
epoll_idx.push(conn.tcp.connection_id()); + } + + // waiting for the next message + let index = executor::epoll_chans(&epoll_vec, -1).unwrap(); + + if index == 0 { + // got a new connection + match node_events.must_recv() { + NodeEvent::Accept(tcp) => { + conns.insert( + tcp.connection_id(), + ConnState { + tcp, + greeting: false, + ttid: TenantTimelineId::empty(), + flush_pending: false, + runtime: tokio::runtime::Builder::new_current_thread().build()?, + }, + ); + } + NodeEvent::Internal(_) => unreachable!(), + } + continue; + } + + let connection_id = epoll_idx[index]; + let conn = conns.get_mut(&connection_id).unwrap(); + let mut next_event = Some(conn.tcp.recv_chan().must_recv()); + + loop { + let event = match next_event { + Some(event) => event, + None => break, + }; + + match event { + NetEvent::Message(msg) => { + let res = conn.process_any(msg, &mut global); + if res.is_err() { + debug!("conn {:?} error: {:#}", connection_id, res.unwrap_err()); + conns.remove(&connection_id); + break; + } + } + NetEvent::Closed => { + // TODO: remove from conns? + } + } + + next_event = conn.tcp.recv_chan().try_recv(); + } + + conns.retain(|_, conn| { + let res = conn.flush(&mut global); + if res.is_err() { + debug!("conn {:?} error: {:?}", conn.tcp, res); + } + res.is_ok() + }); + } +} + +impl ConnState { + /// Process a message from the network. It can be START_REPLICATION request or a valid ProposerAcceptorMessage message. 
+ fn process_any(&mut self, any: AnyMessage, global: &mut GlobalMap) -> Result<()> { + if let AnyMessage::Bytes(copy_data) = any { + let repl_prefix = b"START_REPLICATION "; + if !self.greeting && copy_data.starts_with(repl_prefix) { + self.process_start_replication(copy_data.slice(repl_prefix.len()..), global)?; + bail!("finished processing START_REPLICATION") + } + + let msg = ProposerAcceptorMessage::parse(copy_data)?; + debug!("got msg: {:?}", msg); + self.process(msg, global) + } else { + bail!("unexpected message, expected AnyMessage::Bytes"); + } + } + + /// Process START_REPLICATION request. + fn process_start_replication( + &mut self, + copy_data: Bytes, + global: &mut GlobalMap, + ) -> Result<()> { + // format is " " + let str = String::from_utf8(copy_data.to_vec())?; + + let mut parts = str.split(' '); + let tenant_id = parts.next().unwrap().parse::()?; + let timeline_id = parts.next().unwrap().parse::()?; + let start_lsn = parts.next().unwrap().parse::()?; + let end_lsn = parts.next().unwrap().parse::()?; + + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + let shared_state = global.get(&ttid); + + // read bytes from start_lsn to end_lsn + let mut buf = vec![0; (end_lsn - start_lsn) as usize]; + shared_state.disk.wal.lock().read(start_lsn, &mut buf); + + // send bytes to the client + self.tcp.send(AnyMessage::Bytes(Bytes::from(buf))); + Ok(()) + } + + /// Get or create a timeline. + fn init_timeline( + &mut self, + ttid: TenantTimelineId, + server_info: ServerInfo, + global: &mut GlobalMap, + ) -> Result<()> { + self.ttid = ttid; + if global.has_tli(&ttid) { + return Ok(()); + } + + global.create(ttid, server_info) + } + + /// Process a ProposerAcceptorMessage. 
+ fn process(&mut self, msg: ProposerAcceptorMessage, global: &mut GlobalMap) -> Result<()> { + if !self.greeting { + self.greeting = true; + + match msg { + ProposerAcceptorMessage::Greeting(ref greeting) => { + tracing::info!( + "start handshake with walproposer {:?} {:?}", + self.tcp, + greeting + ); + let server_info = ServerInfo { + pg_version: greeting.pg_version, + system_id: greeting.system_id, + wal_seg_size: greeting.wal_seg_size, + }; + let ttid = TenantTimelineId::new(greeting.tenant_id, greeting.timeline_id); + self.init_timeline(ttid, server_info, global)? + } + _ => { + bail!("unexpected message {msg:?} instead of greeting"); + } + } + } + + let tli = global.get(&self.ttid); + + match msg { + ProposerAcceptorMessage::AppendRequest(append_request) => { + self.flush_pending = true; + self.process_sk_msg( + tli, + &ProposerAcceptorMessage::NoFlushAppendRequest(append_request), + )?; + } + other => { + self.process_sk_msg(tli, &other)?; + } + } + + Ok(()) + } + + /// Process FlushWAL if needed. 
+ fn flush(&mut self, global: &mut GlobalMap) -> Result<()> { + // TODO: try to add extra flushes in simulation, to verify that extra flushes don't break anything + if !self.flush_pending { + return Ok(()); + } + self.flush_pending = false; + let shared_state = global.get(&self.ttid); + self.process_sk_msg(shared_state, &ProposerAcceptorMessage::FlushWAL) + } + + /// Make safekeeper process a message and send a reply to the TCP + fn process_sk_msg( + &mut self, + shared_state: &mut SharedState, + msg: &ProposerAcceptorMessage, + ) -> Result<()> { + let mut reply = self.runtime.block_on(shared_state.sk.process_msg(msg))?; + if let Some(reply) = &mut reply { + // TODO: if this is AppendResponse, fill in proper hot standby feedback and disk consistent lsn + + let mut buf = BytesMut::with_capacity(128); + reply.serialize(&mut buf)?; + + self.tcp.send(AnyMessage::Bytes(buf.into())); + } + Ok(()) + } +} + +impl Drop for ConnState { + fn drop(&mut self) { + debug!("dropping conn: {:?}", self.tcp); + if !std::thread::panicking() { + self.tcp.close(); + } + // TODO: clean up non-fsynced WAL + } +} diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs new file mode 100644 index 0000000000..c2db9de78a --- /dev/null +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -0,0 +1,282 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use parking_lot::Mutex; +use safekeeper::state::TimelinePersistentState; +use utils::id::TenantTimelineId; + +use super::block_storage::BlockStorage; + +use std::{ops::Deref, time::Instant}; + +use anyhow::Result; +use bytes::{Buf, BytesMut}; +use futures::future::BoxFuture; +use postgres_ffi::{waldecoder::WalStreamDecoder, XLogSegNo}; +use safekeeper::{control_file, metrics::WalStorageMetrics, wal_storage}; +use tracing::{debug, info}; +use utils::lsn::Lsn; + +/// All safekeeper state that is usually saved to disk. 
+pub struct SafekeeperDisk { + pub timelines: Mutex>>, +} + +impl Default for SafekeeperDisk { + fn default() -> Self { + Self::new() + } +} + +impl SafekeeperDisk { + pub fn new() -> Self { + SafekeeperDisk { + timelines: Mutex::new(HashMap::new()), + } + } + + pub fn put_state( + &self, + ttid: &TenantTimelineId, + state: TimelinePersistentState, + ) -> Arc { + self.timelines + .lock() + .entry(*ttid) + .and_modify(|e| { + let mut mu = e.state.lock(); + *mu = state.clone(); + }) + .or_insert_with(|| { + Arc::new(TimelineDisk { + state: Mutex::new(state), + wal: Mutex::new(BlockStorage::new()), + }) + }) + .clone() + } +} + +/// Control file state and WAL storage. +pub struct TimelineDisk { + pub state: Mutex, + pub wal: Mutex, +} + +/// Implementation of `control_file::Storage` trait. +pub struct DiskStateStorage { + persisted_state: TimelinePersistentState, + disk: Arc, + last_persist_at: Instant, +} + +impl DiskStateStorage { + pub fn new(disk: Arc) -> Self { + let guard = disk.state.lock(); + let state = guard.clone(); + drop(guard); + DiskStateStorage { + persisted_state: state, + disk, + last_persist_at: Instant::now(), + } + } +} + +#[async_trait::async_trait] +impl control_file::Storage for DiskStateStorage { + /// Persist safekeeper state on disk and update internal state. + async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { + self.persisted_state = s.clone(); + *self.disk.state.lock() = s.clone(); + Ok(()) + } + + /// Timestamp of last persist. + fn last_persist_at(&self) -> Instant { + // TODO: don't rely on it in tests + self.last_persist_at + } +} + +impl Deref for DiskStateStorage { + type Target = TimelinePersistentState; + + fn deref(&self) -> &Self::Target { + &self.persisted_state + } +} + +/// Implementation of `wal_storage::Storage` trait. +pub struct DiskWALStorage { + /// Written to disk, but possibly still in the cache and not fully persisted. 
+ /// Also can be ahead of record_lsn, if happen to be in the middle of a WAL record. + write_lsn: Lsn, + + /// The LSN of the last WAL record written to disk. Still can be not fully flushed. + write_record_lsn: Lsn, + + /// The LSN of the last WAL record flushed to disk. + flush_record_lsn: Lsn, + + /// Decoder is required for detecting boundaries of WAL records. + decoder: WalStreamDecoder, + + /// Bytes of WAL records that are not yet written to disk. + unflushed_bytes: BytesMut, + + /// Contains BlockStorage for WAL. + disk: Arc, +} + +impl DiskWALStorage { + pub fn new(disk: Arc, state: &TimelinePersistentState) -> Result { + let write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + Self::find_end_of_wal(disk.clone(), state.commit_lsn)? + }; + + let flush_lsn = write_lsn; + Ok(DiskWALStorage { + write_lsn, + write_record_lsn: flush_lsn, + flush_record_lsn: flush_lsn, + decoder: WalStreamDecoder::new(flush_lsn, 16), + unflushed_bytes: BytesMut::new(), + disk, + }) + } + + fn find_end_of_wal(disk: Arc, start_lsn: Lsn) -> Result { + let mut buf = [0; 8192]; + let mut pos = start_lsn.0; + let mut decoder = WalStreamDecoder::new(start_lsn, 16); + let mut result = start_lsn; + loop { + disk.wal.lock().read(pos, &mut buf); + pos += buf.len() as u64; + decoder.feed_bytes(&buf); + + loop { + match decoder.poll_decode() { + Ok(Some(record)) => result = record.0, + Err(e) => { + debug!( + "find_end_of_wal reached end at {:?}, decode error: {:?}", + result, e + ); + return Ok(result); + } + Ok(None) => break, // need more data + } + } + } + } +} + +#[async_trait::async_trait] +impl wal_storage::Storage for DiskWALStorage { + /// LSN of last durably stored WAL record. + fn flush_lsn(&self) -> Lsn { + self.flush_record_lsn + } + + async fn initialize_first_segment(&mut self, _init_lsn: Lsn) -> Result<()> { + Ok(()) + } + + /// Write piece of WAL from buf to disk, but not necessarily sync it. 
+ async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> { + if self.write_lsn != startpos { + panic!("write_wal called with wrong startpos"); + } + + self.unflushed_bytes.extend_from_slice(buf); + self.write_lsn += buf.len() as u64; + + if self.decoder.available() != startpos { + info!( + "restart decoder from {} to {}", + self.decoder.available(), + startpos, + ); + self.decoder = WalStreamDecoder::new(startpos, 16); + } + self.decoder.feed_bytes(buf); + loop { + match self.decoder.poll_decode()? { + None => break, // no full record yet + Some((lsn, _rec)) => { + self.write_record_lsn = lsn; + } + } + } + + Ok(()) + } + + /// Truncate WAL at specified LSN, which must be the end of WAL record. + async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { + panic!( + "truncate_wal called on non-written WAL, write_lsn={}, end_pos={}", + self.write_lsn, end_pos + ); + } + + self.flush_wal().await?; + + // write zeroes to disk from end_pos until self.write_lsn + let buf = [0; 8192]; + let mut pos = end_pos.0; + while pos < self.write_lsn.0 { + self.disk.wal.lock().write(pos, &buf); + pos += buf.len() as u64; + } + + self.write_lsn = end_pos; + self.write_record_lsn = end_pos; + self.flush_record_lsn = end_pos; + self.unflushed_bytes.clear(); + self.decoder = WalStreamDecoder::new(end_pos, 16); + + Ok(()) + } + + /// Durably store WAL on disk, up to the last written WAL record. + async fn flush_wal(&mut self) -> Result<()> { + if self.flush_record_lsn == self.write_record_lsn { + // no need to do extra flush + return Ok(()); + } + + let num_bytes = self.write_record_lsn.0 - self.flush_record_lsn.0; + + self.disk.wal.lock().write( + self.flush_record_lsn.0, + &self.unflushed_bytes[..num_bytes as usize], + ); + self.unflushed_bytes.advance(num_bytes as usize); + self.flush_record_lsn = self.write_record_lsn; + + Ok(()) + } + + /// Remove all segments <= given segno. 
Returns function doing that as we + /// want to perform it without timeline lock. + fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> { + Box::pin(async move { Ok(()) }) + } + + /// Release resources associated with the storage -- technically, close FDs. + /// Currently we don't remove timelines until restart (#3146), so need to + /// spare descriptors. This would be useful for temporary tli detach as + /// well. + fn close(&mut self) {} + + /// Get metrics for this timeline. + fn get_metrics(&self) -> WalStorageMetrics { + WalStorageMetrics::default() + } +} diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs new file mode 100644 index 0000000000..0d7aaf517b --- /dev/null +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -0,0 +1,436 @@ +use std::{cell::Cell, str::FromStr, sync::Arc}; + +use crate::walproposer_sim::{safekeeper::run_server, walproposer_api::SimulationApi}; +use desim::{ + executor::{self, ExternalHandle}, + node_os::NodeOs, + options::{Delay, NetworkOptions}, + proto::{AnyMessage, NodeEvent}, + world::Node, + world::World, +}; +use rand::{Rng, SeedableRng}; +use tracing::{debug, info_span, warn}; +use utils::{id::TenantTimelineId, lsn::Lsn}; +use walproposer::walproposer::{Config, Wrapper}; + +use super::{ + log::SimClock, safekeeper_disk::SafekeeperDisk, walproposer_api, + walproposer_disk::DiskWalProposer, +}; + +/// Simulated safekeeper node. +pub struct SafekeeperNode { + pub node: Arc, + pub id: u32, + pub disk: Arc, + pub thread: Cell, +} + +impl SafekeeperNode { + /// Create and start a safekeeper at the specified Node. 
+ pub fn new(node: Arc) -> Self { + let disk = Arc::new(SafekeeperDisk::new()); + let thread = Cell::new(SafekeeperNode::launch(disk.clone(), node.clone())); + + Self { + id: node.id, + node, + disk, + thread, + } + } + + fn launch(disk: Arc, node: Arc) -> ExternalHandle { + // start the server thread + node.launch(move |os| { + run_server(os, disk).expect("server should finish without errors"); + }) + } + + /// Restart the safekeeper. + pub fn restart(&self) { + let new_thread = SafekeeperNode::launch(self.disk.clone(), self.node.clone()); + let old_thread = self.thread.replace(new_thread); + old_thread.crash_stop(); + } +} + +/// Simulated walproposer node. +pub struct WalProposer { + thread: ExternalHandle, + node: Arc, + disk: Arc, + sync_safekeepers: bool, +} + +impl WalProposer { + /// Generic start function for both modes. + fn start( + os: NodeOs, + disk: Arc, + ttid: TenantTimelineId, + addrs: Vec, + lsn: Option, + ) { + let sync_safekeepers = lsn.is_none(); + + let _enter = if sync_safekeepers { + info_span!("sync", started = executor::now()).entered() + } else { + info_span!("walproposer", started = executor::now()).entered() + }; + + os.log_event(format!("started;walproposer;{}", sync_safekeepers as i32)); + + let config = Config { + ttid, + safekeepers_list: addrs, + safekeeper_reconnect_timeout: 1000, + safekeeper_connection_timeout: 5000, + sync_safekeepers, + }; + let args = walproposer_api::Args { + os, + config: config.clone(), + disk, + redo_start_lsn: lsn, + }; + let api = SimulationApi::new(args); + let wp = Wrapper::new(Box::new(api), config); + wp.start(); + } + + /// Start walproposer in a sync_safekeepers mode. 
+ pub fn launch_sync(ttid: TenantTimelineId, addrs: Vec, node: Arc) -> Self { + debug!("sync_safekeepers started at node {}", node.id); + let disk = DiskWalProposer::new(); + let disk_wp = disk.clone(); + + // start the client thread + let handle = node.launch(move |os| { + WalProposer::start(os, disk_wp, ttid, addrs, None); + }); + + Self { + thread: handle, + node, + disk, + sync_safekeepers: true, + } + } + + /// Start walproposer in a normal mode. + pub fn launch_walproposer( + ttid: TenantTimelineId, + addrs: Vec, + node: Arc, + lsn: Lsn, + ) -> Self { + debug!("walproposer started at node {}", node.id); + let disk = DiskWalProposer::new(); + disk.lock().reset_to(lsn); + let disk_wp = disk.clone(); + + // start the client thread + let handle = node.launch(move |os| { + WalProposer::start(os, disk_wp, ttid, addrs, Some(lsn)); + }); + + Self { + thread: handle, + node, + disk, + sync_safekeepers: false, + } + } + + pub fn write_tx(&mut self, cnt: usize) { + let start_lsn = self.disk.lock().flush_rec_ptr(); + + for _ in 0..cnt { + self.disk + .lock() + .insert_logical_message("prefix", b"message") + .expect("failed to generate logical message"); + } + + let end_lsn = self.disk.lock().flush_rec_ptr(); + + // log event + self.node + .log_event(format!("write_wal;{};{};{}", start_lsn.0, end_lsn.0, cnt)); + + // now we need to set "Latch" in walproposer + self.node + .node_events() + .send(NodeEvent::Internal(AnyMessage::Just32(0))); + } + + pub fn stop(&self) { + self.thread.crash_stop(); + } +} + +/// Holds basic simulation settings, such as network options. +pub struct TestConfig { + pub network: NetworkOptions, + pub timeout: u64, + pub clock: Option, +} + +impl TestConfig { + /// Create a new TestConfig with default settings. 
+ pub fn new(clock: Option) -> Self { + Self { + network: NetworkOptions { + keepalive_timeout: Some(2000), + connect_delay: Delay { + min: 1, + max: 5, + fail_prob: 0.0, + }, + send_delay: Delay { + min: 1, + max: 5, + fail_prob: 0.0, + }, + }, + timeout: 1_000 * 10, + clock, + } + } + + /// Start a new simulation with the specified seed. + pub fn start(&self, seed: u64) -> Test { + let world = Arc::new(World::new(seed, Arc::new(self.network.clone()))); + + if let Some(clock) = &self.clock { + clock.set_clock(world.clock()); + } + + let servers = [ + SafekeeperNode::new(world.new_node()), + SafekeeperNode::new(world.new_node()), + SafekeeperNode::new(world.new_node()), + ]; + + let server_ids = [servers[0].id, servers[1].id, servers[2].id]; + let safekeepers_addrs = server_ids.map(|id| format!("node:{}", id)).to_vec(); + + let ttid = TenantTimelineId::generate(); + + Test { + world, + servers, + sk_list: safekeepers_addrs, + ttid, + timeout: self.timeout, + } + } +} + +/// Holds simulation state. +pub struct Test { + pub world: Arc, + pub servers: [SafekeeperNode; 3], + pub sk_list: Vec, + pub ttid: TenantTimelineId, + pub timeout: u64, +} + +impl Test { + /// Start a sync_safekeepers thread and wait for it to finish. + pub fn sync_safekeepers(&self) -> anyhow::Result { + let wp = self.launch_sync_safekeepers(); + + // poll until exit or timeout + let time_limit = self.timeout; + while self.world.step() && self.world.now() < time_limit && !wp.thread.is_finished() {} + + if !wp.thread.is_finished() { + anyhow::bail!("timeout or idle stuck"); + } + + let res = wp.thread.result(); + if res.0 != 0 { + anyhow::bail!("non-zero exitcode: {:?}", res); + } + let lsn = Lsn::from_str(&res.1)?; + Ok(lsn) + } + + /// Spawn a new sync_safekeepers thread. + pub fn launch_sync_safekeepers(&self) -> WalProposer { + WalProposer::launch_sync(self.ttid, self.sk_list.clone(), self.world.new_node()) + } + + /// Spawn a new walproposer thread. 
+    ///
+    /// An LSN of 0 is replaced with 21623024.
+    pub fn launch_walproposer(&self, lsn: Lsn) -> WalProposer {
+        let start_lsn = match lsn {
+            // usual LSN after basebackup
+            Lsn(0) => Lsn(21623024),
+            other => other,
+        };
+
+        WalProposer::launch_walproposer(
+            self.ttid,
+            self.sk_list.clone(),
+            self.world.new_node(),
+            start_lsn,
+        )
+    }
+
+    /// Execute the simulation for the specified duration.
+    ///
+    /// Stops early when the world has no more steps or `self.timeout` is hit.
+    pub fn poll_for_duration(&self, duration: u64) {
+        let deadline = std::cmp::min(self.world.now() + duration, self.timeout);
+        loop {
+            if !self.world.step() {
+                break;
+            }
+            if self.world.now() >= deadline {
+                break;
+            }
+        }
+    }
+
+    /// Execute the simulation together with events defined in some schedule.
+    ///
+    /// Starts with a sync_safekeepers run. Whenever it finishes successfully,
+    /// a normal walproposer is launched from the reported LSN; on a non-zero
+    /// exit code sync_safekeepers is simply restarted. Schedule events are
+    /// applied once the simulated time reaches them. Returns after the last
+    /// event has been applied.
+    pub fn run_schedule(&self, schedule: &Schedule) -> anyhow::Result<()> {
+        // scheduling empty events so that world will stop in those points
+        {
+            let clock = self.world.clock();
+            let now = self.world.now();
+            for event in schedule.iter().filter(|event| event.0 >= now) {
+                clock.schedule_fake(event.0 - now);
+            }
+        }
+
+        let mut wp = self.launch_sync_safekeepers();
+
+        let mut skipped_tx = 0;
+        let mut started_tx = 0;
+
+        // Index of the next schedule entry that has not been applied yet.
+        let mut next_idx = 0;
+
+        loop {
+            if wp.sync_safekeepers && wp.thread.is_finished() {
+                let res = wp.thread.result();
+                if res.0 != 0 {
+                    warn!("sync non-zero exitcode: {:?}", res);
+                    debug!("restarting sync_safekeepers");
+                    // restart the sync_safekeepers
+                    wp = self.launch_sync_safekeepers();
+                    continue;
+                }
+                let lsn = Lsn::from_str(&res.1)?;
+                debug!("sync_safekeepers finished at LSN {}", lsn);
+                wp = self.launch_walproposer(lsn);
+                debug!("walproposer started at thread {}", wp.thread.id());
+            }
+
+            let now = self.world.now();
+            while let Some(event) = schedule.get(next_idx).filter(|event| event.0 <= now) {
+                if event.0 != now {
+                    warn!("skipped event {:?} at {}", event, now);
+                }
+
+                match &event.1 {
+                    TestAction::WriteTx(size) => {
+                        // WAL can only be written by a running, non-sync walproposer.
+                        let can_write = !wp.sync_safekeepers && !wp.thread.is_finished();
+                        if can_write {
+                            started_tx += *size;
+                            wp.write_tx(*size);
+                            debug!("written {} transactions", size);
+                        } else {
+                            skipped_tx += size;
+                            debug!("skipped {} transactions", size);
+                        }
+                    }
+                    TestAction::RestartSafekeeper(id) => {
+                        debug!("restarting safekeeper {}", id);
+                        self.servers[*id].restart();
+                    }
+                    TestAction::RestartWalProposer => {
+                        debug!("restarting sync_safekeepers");
+                        wp.stop();
+                        wp = self.launch_sync_safekeepers();
+                    }
+                }
+                next_idx += 1;
+            }
+
+            if next_idx == schedule.len() {
+                break;
+            }
+            let next_event_time = schedule[next_idx].0;
+
+            // poll until the next event; a walproposer that is still running
+            // additionally interrupts the polling as soon as it finishes
+            let watch_wp = !wp.thread.is_finished();
+            while self.world.step()
+                && self.world.now() < next_event_time
+                && !(watch_wp && wp.thread.is_finished())
+            {}
+        }
+
+        debug!(
+            "finished schedule, total steps: {}",
+            self.world.get_thread_step_count()
+        );
+        debug!("skipped_tx: {}", skipped_tx);
+        debug!("started_tx: {}", started_tx);
+
+        Ok(())
+    }
+}
+
+/// A single action that `Test::run_schedule` can perform.
+#[derive(Debug, Clone)]
+pub enum TestAction {
+    // Number of logical messages to write (see `WalProposer::write_tx`).
+    WriteTx(usize),
+    // Index into `Test::servers`.
+    RestartSafekeeper(usize),
+    RestartWalProposer,
+}
+
+/// Pairs of (simulated time, action), with non-decreasing times when built
+/// by `generate_schedule`.
+pub type Schedule = Vec<(u64, TestAction)>;
+
+/// Generate a random schedule of 1 to 99 actions from `seed`.
+///
+/// Consecutive actions are 0 to 499 time units apart.
+pub fn generate_schedule(seed: u64) -> Schedule {
+    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+    let mut time = 0;
+
+    let cnt = rng.gen_range(1..100);
+
+    (0..cnt)
+        .map(|_| {
+            time += rng.gen_range(0..500);
+            let action = match rng.gen_range(0..3) {
+                0 => TestAction::WriteTx(rng.gen_range(1..10)),
+                1 => TestAction::RestartSafekeeper(rng.gen_range(0..3)),
+                2 => TestAction::RestartWalProposer,
+                _ => unreachable!(),
+            };
+            (time, action)
+        })
+        .collect()
+}
+
+/// Generate random network options from `seed`.
+///
+/// Both delays share the same min/max bounds. The send failure probability
+/// is drawn below the connect failure probability, which in turn is drawn
+/// below a random cap under 0.9.
+pub fn generate_network_opts(seed: u64) -> NetworkOptions {
+    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+
+    // NOTE: the order of the draws below defines the result for a given seed.
+    let timeout = rng.gen_range(100..2000);
+    let max_delay = rng.gen_range(1..2 * timeout);
+    let min_delay = rng.gen_range(1..=max_delay);
+
+    let max_fail_prob = rng.gen_range(0.0..0.9);
+    let connect_fail_prob = rng.gen_range(0.0..max_fail_prob);
+    let send_fail_prob = rng.gen_range(0.0..connect_fail_prob);
+
+    let connect_delay = Delay {
+        min: min_delay,
+        max: max_delay,
+        fail_prob: connect_fail_prob,
+    };
+    let send_delay = Delay {
+        min: min_delay,
+        max: max_delay,
+        fail_prob: send_fail_prob,
+    };
+
+    NetworkOptions {
+        keepalive_timeout: Some(timeout),
+        connect_delay,
+        send_delay,
+    }
+}
diff --git a/safekeeper/tests/walproposer_sim/simulation_logs.rs b/safekeeper/tests/walproposer_sim/simulation_logs.rs
new file mode 100644
index 0000000000..38885e5dd0
--- /dev/null
+++ b/safekeeper/tests/walproposer_sim/simulation_logs.rs
@@ -0,0 +1,187 @@
+use desim::proto::SimEvent;
+use tracing::debug;
+
+/// Role of a node, learned from its `started;...` event.
+#[derive(Debug, Clone, PartialEq, Eq)]
+enum NodeKind {
+    Unknown,
+    Safekeeper,
+    WalProposer,
+}
+
+impl Default for NodeKind {
+    fn default() -> Self {
+        NodeKind::Unknown
+    }
+}
+
+/// Simulation state of walproposer/safekeeper, derived from the simulation logs.
+#[derive(Clone, Debug, Default)]
+struct NodeInfo {
+    kind: NodeKind,
+
+    // walproposer
+    is_sync: bool,
+    term: u64,
+    epoch_lsn: u64,
+
+    // safekeeper
+    commit_lsn: u64,
+    flush_lsn: u64,
+}
+
+impl NodeInfo {
+    /// Set the node kind on first use; afterwards only check that the kind
+    /// has not changed.
+    fn init_kind(&mut self, kind: NodeKind) {
+        match self.kind {
+            NodeKind::Unknown => self.kind = kind,
+            _ => assert!(self.kind == kind),
+        }
+    }
+
+    /// Apply a `started;<kind>[;<is_sync>]` event to this node.
+    fn started(&mut self, data: &str) {
+        let mut fields = data.split(';');
+        assert!(fields.next().unwrap() == "started");
+
+        let kind = fields.next().unwrap();
+        match kind {
+            "safekeeper" => {
+                self.init_kind(NodeKind::Safekeeper);
+            }
+            "walproposer" => {
+                self.init_kind(NodeKind::WalProposer);
+                let is_sync: u8 = fields.next().unwrap().parse().unwrap();
+                self.is_sync = is_sync != 0;
+            }
+            _ => unreachable!(),
+        }
+    }
+}
+
+/// Global state of the simulation, derived from the simulation logs.
+#[derive(Debug, Default)] +struct GlobalState { + nodes: Vec, + commit_lsn: u64, + write_lsn: u64, + max_write_lsn: u64, + + written_wal: u64, + written_records: u64, +} + +impl GlobalState { + fn new() -> Self { + Default::default() + } + + fn get(&mut self, id: u32) -> &mut NodeInfo { + let id = id as usize; + if id >= self.nodes.len() { + self.nodes.resize(id + 1, NodeInfo::default()); + } + &mut self.nodes[id] + } +} + +/// Try to find inconsistencies in the simulation log. +pub fn validate_events(events: Vec) { + const INITDB_LSN: u64 = 21623024; + + let hook = std::panic::take_hook(); + scopeguard::defer_on_success! { + std::panic::set_hook(hook); + }; + + let mut state = GlobalState::new(); + state.max_write_lsn = INITDB_LSN; + + for event in events { + debug!("{:?}", event); + + let node = state.get(event.node); + if event.data.starts_with("started;") { + node.started(&event.data); + continue; + } + assert!(node.kind != NodeKind::Unknown); + + // drop reference to unlock state + let mut node = node.clone(); + + let mut parts = event.data.split(';'); + match node.kind { + NodeKind::Safekeeper => match parts.next().unwrap() { + "tli_loaded" => { + let flush_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let commit_lsn: u64 = parts.next().unwrap().parse().unwrap(); + node.flush_lsn = flush_lsn; + node.commit_lsn = commit_lsn; + } + _ => unreachable!(), + }, + NodeKind::WalProposer => { + match parts.next().unwrap() { + "prop_elected" => { + let prop_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let prop_term: u64 = parts.next().unwrap().parse().unwrap(); + let prev_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let prev_term: u64 = parts.next().unwrap().parse().unwrap(); + + assert!(prop_lsn >= prev_lsn); + assert!(prop_term >= prev_term); + + assert!(prop_lsn >= state.commit_lsn); + + if prop_lsn > state.write_lsn { + assert!(prop_lsn <= state.max_write_lsn); + debug!( + "moving write_lsn up from {} to {}", + state.write_lsn, prop_lsn + ); 
+ state.write_lsn = prop_lsn; + } + if prop_lsn < state.write_lsn { + debug!( + "moving write_lsn down from {} to {}", + state.write_lsn, prop_lsn + ); + state.write_lsn = prop_lsn; + } + + node.epoch_lsn = prop_lsn; + node.term = prop_term; + } + "write_wal" => { + assert!(!node.is_sync); + let start_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let end_lsn: u64 = parts.next().unwrap().parse().unwrap(); + let cnt: u64 = parts.next().unwrap().parse().unwrap(); + + let size = end_lsn - start_lsn; + state.written_wal += size; + state.written_records += cnt; + + // TODO: If we allow writing WAL before winning the election + + assert!(start_lsn >= state.commit_lsn); + assert!(end_lsn >= start_lsn); + // assert!(start_lsn == state.write_lsn); + state.write_lsn = end_lsn; + + if end_lsn > state.max_write_lsn { + state.max_write_lsn = end_lsn; + } + } + "commit_lsn" => { + let lsn: u64 = parts.next().unwrap().parse().unwrap(); + assert!(lsn >= state.commit_lsn); + state.commit_lsn = lsn; + } + _ => unreachable!(), + } + } + _ => unreachable!(), + } + + // update the node in the state struct + *state.get(event.node) = node; + } +} diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs new file mode 100644 index 0000000000..5578c94cf6 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -0,0 +1,673 @@ +use std::{ + cell::{RefCell, RefMut, UnsafeCell}, + ffi::CStr, + sync::Arc, +}; + +use bytes::Bytes; +use desim::{ + executor::{self, PollSome}, + network::TCP, + node_os::NodeOs, + proto::{AnyMessage, NetEvent, NodeEvent}, + world::NodeId, +}; +use tracing::debug; +use utils::lsn::Lsn; +use walproposer::{ + api_bindings::Level, + bindings::{ + NeonWALReadResult, SafekeeperStateDesiredEvents, WL_SOCKET_READABLE, WL_SOCKET_WRITEABLE, + }, + walproposer::{ApiImpl, Config}, +}; + +use super::walproposer_disk::DiskWalProposer; + +/// Special state for each wp->sk connection. 
+struct SafekeeperConn {
+    host: String,
+    port: String,
+    // `port` parsed as a number.
+    node_id: NodeId,
+    // socket is Some(..) equals to connection is established
+    socket: Option<TCP>,
+    // connection is in progress
+    is_connecting: bool,
+    // START_WAL_PUSH is in progress
+    is_start_wal_push: bool,
+    // pointer to Safekeeper in walproposer for callbacks
+    raw_ptr: *mut walproposer::bindings::Safekeeper,
+}
+
+impl SafekeeperConn {
+    /// Create a not-yet-connected state for the safekeeper at `host:port`.
+    ///
+    /// Panics if `port` is not a valid node id.
+    pub fn new(host: String, port: String) -> Self {
+        // port number is the same as NodeId
+        let node_id = port.parse::<NodeId>().unwrap();
+
+        Self {
+            host,
+            port,
+            node_id,
+            socket: None,
+            is_connecting: false,
+            is_start_wal_push: false,
+            raw_ptr: std::ptr::null_mut(),
+        }
+    }
+}
+
+/// Simulation version of a postgres WaitEventSet. At pos 0 there is always
+/// a special NodeEvents channel, which is used as a latch.
+struct EventSet {
+    os: NodeOs,
+    // all pollable channels, 0 is always NodeEvent channel
+    chans: Vec<Box<dyn PollSome>>,
+    // 0 is always nullptr
+    sk_ptrs: Vec<*mut walproposer::bindings::Safekeeper>,
+    // event mask for each channel
+    masks: Vec<u32>,
+}
+
+impl EventSet {
+    /// Create an event set that contains only the latch (the node events
+    /// channel of `os`), registered as readable at position 0.
+    pub fn new(os: NodeOs) -> Self {
+        let latch: Box<dyn PollSome> = Box::new(os.node_events());
+
+        Self {
+            os,
+            chans: vec![latch],
+            sk_ptrs: vec![std::ptr::null_mut()],
+            masks: vec![WL_SOCKET_READABLE],
+        }
+    }
+
+    /// Leaves all readable channels at the beginning of the array.
+ fn sort_readable(&mut self) -> usize { + let mut cnt = 1; + for i in 1..self.chans.len() { + if self.masks[i] & WL_SOCKET_READABLE != 0 { + self.chans.swap(i, cnt); + self.sk_ptrs.swap(i, cnt); + self.masks.swap(i, cnt); + cnt += 1; + } + } + cnt + } + + fn update_event_set(&mut self, conn: &SafekeeperConn, event_mask: u32) { + let index = self + .sk_ptrs + .iter() + .position(|&ptr| ptr == conn.raw_ptr) + .expect("safekeeper should exist in event set"); + self.masks[index] = event_mask; + } + + fn add_safekeeper(&mut self, sk: &SafekeeperConn, event_mask: u32) { + for ptr in self.sk_ptrs.iter() { + assert!(*ptr != sk.raw_ptr); + } + + self.chans.push(Box::new( + sk.socket + .as_ref() + .expect("socket should not be closed") + .recv_chan(), + )); + self.sk_ptrs.push(sk.raw_ptr); + self.masks.push(event_mask); + } + + fn remove_safekeeper(&mut self, sk: &SafekeeperConn) { + let index = self.sk_ptrs.iter().position(|&ptr| ptr == sk.raw_ptr); + if index.is_none() { + debug!("remove_safekeeper: sk={:?} not found", sk.raw_ptr); + return; + } + let index = index.unwrap(); + + self.chans.remove(index); + self.sk_ptrs.remove(index); + self.masks.remove(index); + + // to simulate the actual behaviour + self.refresh_event_set(); + } + + /// Updates all masks to match the result of a SafekeeperStateDesiredEvents. + fn refresh_event_set(&mut self) { + for (i, mask) in self.masks.iter_mut().enumerate() { + if i == 0 { + continue; + } + + let mut mask_sk: u32 = 0; + let mut mask_nwr: u32 = 0; + unsafe { SafekeeperStateDesiredEvents(self.sk_ptrs[i], &mut mask_sk, &mut mask_nwr) }; + + if mask_sk != *mask { + debug!( + "refresh_event_set: sk={:?}, old_mask={:#b}, new_mask={:#b}", + self.sk_ptrs[i], *mask, mask_sk + ); + *mask = mask_sk; + } + } + } + + /// Wait for events on all channels. 
+ fn wait(&mut self, timeout_millis: i64) -> walproposer::walproposer::WaitResult { + // all channels are always writeable + for (i, mask) in self.masks.iter().enumerate() { + if *mask & WL_SOCKET_WRITEABLE != 0 { + return walproposer::walproposer::WaitResult::Network( + self.sk_ptrs[i], + WL_SOCKET_WRITEABLE, + ); + } + } + + let cnt = self.sort_readable(); + + let slice = &self.chans[0..cnt]; + match executor::epoll_chans(slice, timeout_millis) { + None => walproposer::walproposer::WaitResult::Timeout, + Some(0) => { + let msg = self.os.node_events().must_recv(); + match msg { + NodeEvent::Internal(AnyMessage::Just32(0)) => { + // got a notification about new WAL available + } + NodeEvent::Internal(_) => unreachable!(), + NodeEvent::Accept(_) => unreachable!(), + } + walproposer::walproposer::WaitResult::Latch + } + Some(index) => walproposer::walproposer::WaitResult::Network( + self.sk_ptrs[index], + WL_SOCKET_READABLE, + ), + } + } +} + +/// This struct handles all calls from walproposer into walproposer_api. 
+pub struct SimulationApi {
+    os: NodeOs,
+    // One connection state per configured safekeeper.
+    safekeepers: RefCell<Vec<SafekeeperConn>>,
+    disk: Arc<DiskWalProposer>,
+    redo_start_lsn: Option<Lsn>,
+    last_logged_commit_lsn: u64,
+    shmem: UnsafeCell<walproposer::bindings::WalproposerShmemState>,
+    config: Config,
+    // None until `init_event_set` is called.
+    event_set: RefCell<Option<EventSet>>,
+}
+
+/// Arguments of `SimulationApi::new`.
+pub struct Args {
+    pub os: NodeOs,
+    pub config: Config,
+    pub disk: Arc<DiskWalProposer>,
+    pub redo_start_lsn: Option<Lsn>,
+}
+
+impl SimulationApi {
+    /// Create the API state for one walproposer run.
+    ///
+    /// Every entry of `config.safekeepers_list` must look like `host:port`;
+    /// panics otherwise.
+    pub fn new(args: Args) -> Self {
+        // initialize connection state for each safekeeper
+        let sk_conns = args
+            .config
+            .safekeepers_list
+            .iter()
+            .map(|addr| {
+                let mut fields = addr.split(':');
+                let host = fields.next().unwrap().to_string();
+                let port = fields.next().unwrap().to_string();
+                SafekeeperConn::new(host, port)
+            })
+            .collect::<Vec<_>>();
+
+        Self {
+            os: args.os,
+            safekeepers: RefCell::new(sk_conns),
+            disk: args.disk,
+            redo_start_lsn: args.redo_start_lsn,
+            last_logged_commit_lsn: 0,
+            shmem: UnsafeCell::new(walproposer::api_bindings::empty_shmem()),
+            config: args.config,
+            event_set: RefCell::new(None),
+        }
+    }
+
+    /// Get SafekeeperConn for the given Safekeeper.
+    ///
+    /// The connection is looked up by the port string of `sk`. The returned
+    /// guard keeps `self.safekeepers` mutably borrowed while it is alive.
+    fn get_conn(&self, sk: &mut walproposer::bindings::Safekeeper) -> RefMut<'_, SafekeeperConn> {
+        let sk_port = unsafe { CStr::from_ptr(sk.port).to_str().unwrap() };
+        let conns = self.safekeepers.borrow_mut();
+        RefMut::map(conns, |conns| {
+            conns
+                .iter_mut()
+                .find(|conn| conn.port == sk_port)
+                .expect("safekeeper conn not found by port")
+        })
+    }
+}
+
+impl ApiImpl for SimulationApi {
+    /// Current simulated time as a PG timestamp.
+    fn get_current_timestamp(&self) -> i64 {
+        debug!("get_current_timestamp");
+        // PG TimestampTZ is microseconds, but simulation unit is assumed to be
+        // milliseconds, so multiply by 10^3
+        self.os.now() as i64 * 1000
+    }
+
+    /// Record the donor safekeeper and its LSN in the shared-memory state.
+    fn update_donor(&self, donor: &mut walproposer::bindings::Safekeeper, donor_lsn: u64) {
+        // Write through a reference into the shmem struct. Dereferencing the
+        // raw pointer by value would copy the struct, and the updates below
+        // would be lost together with the temporary copy.
+        // SAFETY: assumes get_shmem_state() returns a valid pointer to this
+        // instance's shmem with no other live reference -- TODO confirm, its
+        // definition is outside this chunk.
+        let shmem = unsafe { &mut *self.get_shmem_state() };
+        shmem.propEpochStartLsn.value = donor_lsn;
+        shmem.donor_conninfo = donor.conninfo;
+    }
+
+    /// Report the connection status, randomly failing some of the calls.
+    fn conn_status(
+        &self,
+        _: &mut walproposer::bindings::Safekeeper,
+    ) -> walproposer::bindings::WalProposerConnStatusType {
+        debug!("conn_status");
+        // break the connection with a 10% chance
+        if self.os.random(100) < 10 {
+            walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_BAD
+        } else {
+            walproposer::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
+        }
+    }
+
+    /// Open a TCP socket to the safekeeper and remember `sk` for callbacks.
+    ///
+    /// Panics if the connection already has a socket.
+    fn conn_connect_start(&self, sk: &mut walproposer::bindings::Safekeeper) {
+        debug!("conn_connect_start");
+        let mut conn = self.get_conn(sk);
+
+        assert!(conn.socket.is_none());
+        let socket = self.os.open_tcp(conn.node_id);
+        conn.socket = Some(socket);
+        conn.raw_ptr = sk;
+        conn.is_connecting = true;
+    }
+
+    /// Connection polling always succeeds immediately.
+    fn conn_connect_poll(
+        &self,
+        _: &mut walproposer::bindings::Safekeeper,
+    ) -> walproposer::bindings::WalProposerConnectPollStatusType {
+        debug!("conn_connect_poll");
+        // TODO: break the connection here
+        walproposer::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
+    }
+
+    /// The query text is only logged; any query marks START_WAL_PUSH as in
+    /// progress and reports success.
+    fn conn_send_query(&self, sk: &mut walproposer::bindings::Safekeeper, query: &str) -> bool {
+        debug!("conn_send_query: {}", query);
+        self.get_conn(sk).is_start_wal_push = true;
+        true
+    }
+
+    /// Query results are always reported as a successful COPY BOTH.
+    fn conn_get_query_result(
+        &self,
+        _: &mut walproposer::bindings::Safekeeper,
+    ) -> walproposer::bindings::WalProposerExecStatusType {
+        debug!("conn_get_query_result");
+        // TODO: break the connection here
+        walproposer::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
+    }
+
+    /// Try to receive one message from the safekeeper without blocking.
+    ///
+    /// On success the payload is appended to `vec`. A closed connection
+    /// drops the socket and is reported as a read failure.
+    fn conn_async_read(
+        &self,
+        sk: &mut walproposer::bindings::Safekeeper,
+        vec: &mut Vec<u8>,
+    ) -> walproposer::bindings::PGAsyncReadResult {
+        debug!("conn_async_read");
+        let mut conn = self.get_conn(sk);
+
+        let socket = match conn.socket.as_mut() {
+            Some(socket) => socket,
+            None => {
+                // socket is already closed
+                return walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL;
+            }
+        };
+
+        let msg = socket.recv_chan().try_recv();
+
+        match msg {
+            None => {
+                // no message is ready
+                walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_TRY_AGAIN
+            }
+            Some(NetEvent::Closed) => {
+                // connection is closed
+                debug!("conn_async_read: connection is closed");
+                conn.socket = None;
+                walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_FAIL
+            }
+            Some(NetEvent::Message(msg)) => {
+                // got a message
+                let b = match msg {
+                    desim::proto::AnyMessage::Bytes(b) => b,
+                    _ => unreachable!(),
+                };
+                vec.extend_from_slice(&b);
+                walproposer::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS
+            }
+        }
+    }
+
+    /// Send `buf` to the safekeeper. Panics if the socket is closed.
+    fn conn_blocking_write(&self, sk: &mut walproposer::bindings::Safekeeper, buf: &[u8]) -> bool {
+        let mut conn = self.get_conn(sk);
+        debug!("conn_blocking_write to {}: {:?}", conn.node_id, buf);
+        let socket = conn.socket.as_mut().unwrap();
+        socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf)));
+        true
+    }
+
+    /// Send `buf` to the safekeeper if the socket is open. Success is
+    /// reported even when the socket is already closed.
+    fn conn_async_write(
+        &self,
+        sk: &mut walproposer::bindings::Safekeeper,
+        buf: &[u8],
+    ) -> walproposer::bindings::PGAsyncWriteResult {
+        let mut conn = self.get_conn(sk);
+        debug!("conn_async_write to {}: {:?}", conn.node_id, buf);
+        if let Some(socket) = conn.socket.as_mut() {
+            socket.send(desim::proto::AnyMessage::Bytes(Bytes::copy_from_slice(buf)));
+        } else {
+            // connection is already closed
+            debug!("conn_async_write: writing to a closed socket!");
+            // TODO: maybe we should return error here?
+        }
+        walproposer::bindings::PGAsyncWriteResult_PG_ASYNC_WRITE_SUCCESS
+    }
+
+    /// Nothing to allocate in the simulation; always succeeds.
+    fn wal_reader_allocate(&self, _: &mut walproposer::bindings::Safekeeper) -> NeonWALReadResult {
+        debug!("wal_reader_allocate");
+        walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
+    }
+
+    /// Fill `buf` with WAL from the walproposer disk, starting at `startpos`.
+    fn wal_read(
+        &self,
+        _sk: &mut walproposer::bindings::Safekeeper,
+        buf: &mut [u8],
+        startpos: u64,
+    ) -> NeonWALReadResult {
+        self.disk.lock().read(startpos, buf);
+        walproposer::bindings::NeonWALReadResult_NEON_WALREAD_SUCCESS
+    }
+
+    /// Create the event set. Panics if one has already been created.
+    fn init_event_set(&self, _: &mut walproposer::bindings::WalProposer) {
+        debug!("init_event_set");
+        let new_event_set = EventSet::new(self.os.clone());
+        let old_event_set = self.event_set.replace(Some(new_event_set));
+        assert!(old_event_set.is_none());
+    }
+
+    /// Change the event mask of a registered safekeeper.
+    fn update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper, event_mask: u32) {
+        debug!(
+            "update_event_set, sk={:?}, events_mask={:#b}",
+            sk as *mut walproposer::bindings::Safekeeper, event_mask
+        );
+        let conn = self.get_conn(sk);
+
+        self.event_set
+            .borrow_mut()
+            .as_mut()
+            .unwrap()
+            .update_event_set(&conn, event_mask);
+    }
+
+    /// Register a safekeeper in the event set.
+    fn add_safekeeper_event_set(
+        &self,
+        sk: &mut walproposer::bindings::Safekeeper,
+        event_mask: u32,
+    ) {
+        debug!(
+            "add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
+            sk as *mut walproposer::bindings::Safekeeper, event_mask
+        );
+
+        self.event_set
+            .borrow_mut()
+            .as_mut()
+            .unwrap()
+            .add_safekeeper(&self.get_conn(sk), event_mask);
+    }
+
+    /// Remove a safekeeper from the event set.
+    fn rm_safekeeper_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) {
+        debug!(
+            "rm_safekeeper_event_set, sk={:?}",
+            sk as *mut walproposer::bindings::Safekeeper,
+        );
+
+        self.event_set
+            .borrow_mut()
+            .as_mut()
+            .unwrap()
+            .remove_safekeeper(&self.get_conn(sk));
+    }
+
+    fn active_state_update_event_set(&self, sk: &mut walproposer::bindings::Safekeeper) {
+        debug!("active_state_update_event_set");
+
+        assert!(sk.state == walproposer::bindings::SafekeeperState_SS_ACTIVE);
+
self.event_set + .borrow_mut() + .as_mut() + .unwrap() + .refresh_event_set(); + } + + fn wal_reader_events(&self, _sk: &mut walproposer::bindings::Safekeeper) -> u32 { + 0 + } + + fn wait_event_set( + &self, + _: &mut walproposer::bindings::WalProposer, + timeout_millis: i64, + ) -> walproposer::walproposer::WaitResult { + // TODO: handle multiple stages as part of the simulation (e.g. connect, start_wal_push, etc) + let mut conns = self.safekeepers.borrow_mut(); + for conn in conns.iter_mut() { + if conn.socket.is_some() && conn.is_connecting { + conn.is_connecting = false; + debug!("wait_event_set, connecting to {}:{}", conn.host, conn.port); + return walproposer::walproposer::WaitResult::Network( + conn.raw_ptr, + WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE, + ); + } + if conn.socket.is_some() && conn.is_start_wal_push { + conn.is_start_wal_push = false; + debug!( + "wait_event_set, start wal push to {}:{}", + conn.host, conn.port + ); + return walproposer::walproposer::WaitResult::Network( + conn.raw_ptr, + WL_SOCKET_READABLE, + ); + } + } + drop(conns); + + let res = self + .event_set + .borrow_mut() + .as_mut() + .unwrap() + .wait(timeout_millis); + + debug!( + "wait_event_set, timeout_millis={}, res={:?}", + timeout_millis, res, + ); + res + } + + fn strong_random(&self, buf: &mut [u8]) -> bool { + debug!("strong_random"); + buf.fill(0); + true + } + + fn finish_sync_safekeepers(&self, lsn: u64) { + debug!("finish_sync_safekeepers, lsn={}", lsn); + executor::exit(0, Lsn(lsn).to_string()); + } + + fn log_internal(&self, _wp: &mut walproposer::bindings::WalProposer, level: Level, msg: &str) { + debug!("wp_log[{}] {}", level, msg); + if level == Level::Fatal || level == Level::Panic { + if msg.contains("rejects our connection request with term") { + // collected quorum with lower term, then got rejected by next connected safekeeper + executor::exit(1, msg.to_owned()); + } + if msg.contains("collected propEpochStartLsn") && msg.contains(", but basebackup LSN ") + 
{ + // sync-safekeepers collected wrong quorum, walproposer collected another quorum + executor::exit(1, msg.to_owned()); + } + if msg.contains("failed to download WAL for logical replicaiton") { + // Recovery connection broken and recovery was failed + executor::exit(1, msg.to_owned()); + } + if msg.contains("missing majority of votes, collected") { + // Voting bug when safekeeper disconnects after voting + executor::exit(1, msg.to_owned()); + } + panic!("unknown FATAL error from walproposer: {}", msg); + } + } + + fn after_election(&self, wp: &mut walproposer::bindings::WalProposer) { + let prop_lsn = wp.propEpochStartLsn; + let prop_term = wp.propTerm; + + let mut prev_lsn: u64 = 0; + let mut prev_term: u64 = 0; + + unsafe { + let history = wp.propTermHistory.entries; + let len = wp.propTermHistory.n_entries as usize; + if len > 1 { + let entry = *history.wrapping_add(len - 2); + prev_lsn = entry.lsn; + prev_term = entry.term; + } + } + + let msg = format!( + "prop_elected;{};{};{};{}", + prop_lsn, prop_term, prev_lsn, prev_term + ); + + debug!(msg); + self.os.log_event(msg); + } + + fn get_redo_start_lsn(&self) -> u64 { + debug!("get_redo_start_lsn -> {:?}", self.redo_start_lsn); + self.redo_start_lsn.expect("redo_start_lsn is not set").0 + } + + fn get_shmem_state(&self) -> *mut walproposer::bindings::WalproposerShmemState { + self.shmem.get() + } + + fn start_streaming( + &self, + startpos: u64, + callback: &walproposer::walproposer::StreamingCallback, + ) { + let disk = &self.disk; + let disk_lsn = disk.lock().flush_rec_ptr().0; + debug!("start_streaming at {} (disk_lsn={})", startpos, disk_lsn); + if startpos < disk_lsn { + debug!("startpos < disk_lsn, it means we wrote some transaction even before streaming started"); + } + assert!(startpos <= disk_lsn); + let mut broadcasted = Lsn(startpos); + + loop { + let available = disk.lock().flush_rec_ptr(); + assert!(available >= broadcasted); + callback.broadcast(broadcasted, available); + broadcasted = 
available; + callback.poll(); + } + } + + fn process_safekeeper_feedback( + &mut self, + wp: &mut walproposer::bindings::WalProposer, + _sk: &mut walproposer::bindings::Safekeeper, + ) { + debug!("process_safekeeper_feedback, commit_lsn={}", wp.commitLsn); + if wp.commitLsn > self.last_logged_commit_lsn { + self.os.log_event(format!("commit_lsn;{}", wp.commitLsn)); + self.last_logged_commit_lsn = wp.commitLsn; + } + } + + fn get_flush_rec_ptr(&self) -> u64 { + let lsn = self.disk.lock().flush_rec_ptr(); + debug!("get_flush_rec_ptr: {}", lsn); + lsn.0 + } + + fn recovery_download( + &self, + wp: &mut walproposer::bindings::WalProposer, + sk: &mut walproposer::bindings::Safekeeper, + ) -> bool { + let mut startpos = wp.truncateLsn; + let endpos = wp.propEpochStartLsn; + + if startpos == endpos { + debug!("recovery_download: nothing to download"); + return true; + } + + debug!("recovery_download from {} to {}", startpos, endpos,); + + let replication_prompt = format!( + "START_REPLICATION {} {} {} {}", + self.config.ttid.tenant_id, self.config.ttid.timeline_id, startpos, endpos, + ); + let async_conn = self.get_conn(sk); + + let conn = self.os.open_tcp(async_conn.node_id); + conn.send(desim::proto::AnyMessage::Bytes(replication_prompt.into())); + + let chan = conn.recv_chan(); + while startpos < endpos { + let event = chan.recv(); + match event { + NetEvent::Closed => { + debug!("connection closed in recovery"); + break; + } + NetEvent::Message(AnyMessage::Bytes(b)) => { + debug!("got recovery bytes from safekeeper"); + self.disk.lock().write(startpos, &b); + startpos += b.len() as u64; + } + NetEvent::Message(_) => unreachable!(), + } + } + + debug!("recovery finished at {}", startpos); + + startpos == endpos + } + + fn conn_finish(&self, sk: &mut walproposer::bindings::Safekeeper) { + let mut conn = self.get_conn(sk); + debug!("conn_finish to {}", conn.node_id); + if let Some(socket) = conn.socket.as_mut() { + socket.close(); + } else { + // connection is already 
closed + } + conn.socket = None; + } + + fn conn_error_message(&self, _sk: &mut walproposer::bindings::Safekeeper) -> String { + "connection is closed, probably".into() + } +} diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs new file mode 100644 index 0000000000..aa329bd2f0 --- /dev/null +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -0,0 +1,314 @@ +use std::{ffi::CString, sync::Arc}; + +use byteorder::{LittleEndian, WriteBytesExt}; +use crc32c::crc32c_append; +use parking_lot::{Mutex, MutexGuard}; +use postgres_ffi::{ + pg_constants::{ + RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG, + XLR_BLOCK_ID_DATA_SHORT, + }, + v16::{ + wal_craft_test_export::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}, + xlog_utils::{ + XLogSegNoOffsetToRecPtr, XlLogicalMessage, XLOG_RECORD_CRC_OFFS, + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, + XLP_FIRST_IS_CONTRECORD, + }, + XLogRecord, + }, + WAL_SEGMENT_SIZE, XLOG_BLCKSZ, +}; +use utils::lsn::Lsn; + +use super::block_storage::BlockStorage; + +/// Simulation implementation of walproposer WAL storage. +pub struct DiskWalProposer { + state: Mutex, +} + +impl DiskWalProposer { + pub fn new() -> Arc { + Arc::new(DiskWalProposer { + state: Mutex::new(State { + internal_available_lsn: Lsn(0), + prev_lsn: Lsn(0), + disk: BlockStorage::new(), + }), + }) + } + + pub fn lock(&self) -> MutexGuard { + self.state.lock() + } +} + +pub struct State { + // flush_lsn + internal_available_lsn: Lsn, + // needed for WAL generation + prev_lsn: Lsn, + // actual WAL storage + disk: BlockStorage, +} + +impl State { + pub fn read(&self, pos: u64, buf: &mut [u8]) { + self.disk.read(pos, buf); + // TODO: fail on reading uninitialized data + } + + pub fn write(&mut self, pos: u64, buf: &[u8]) { + self.disk.write(pos, buf); + } + + /// Update the internal available LSN to the given value. 
+ pub fn reset_to(&mut self, lsn: Lsn) { + self.internal_available_lsn = lsn; + } + + /// Get current LSN. + pub fn flush_rec_ptr(&self) -> Lsn { + self.internal_available_lsn + } + + /// Generate a new WAL record at the current LSN. + pub fn insert_logical_message(&mut self, prefix: &str, msg: &[u8]) -> anyhow::Result<()> { + let prefix_cstr = CString::new(prefix)?; + let prefix_bytes = prefix_cstr.as_bytes_with_nul(); + + let lm = XlLogicalMessage { + db_id: 0, + transactional: 0, + prefix_size: prefix_bytes.len() as ::std::os::raw::c_ulong, + message_size: msg.len() as ::std::os::raw::c_ulong, + }; + + let record_bytes = lm.encode(); + let rdatas: Vec<&[u8]> = vec![&record_bytes, prefix_bytes, msg]; + insert_wal_record(self, rdatas, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE) + } +} + +fn insert_wal_record( + state: &mut State, + rdatas: Vec<&[u8]>, + rmid: u8, + info: u8, +) -> anyhow::Result<()> { + // bytes right after the header, in the same rdata block + let mut scratch = Vec::new(); + let mainrdata_len: usize = rdatas.iter().map(|rdata| rdata.len()).sum(); + + if mainrdata_len > 0 { + if mainrdata_len > 255 { + scratch.push(XLR_BLOCK_ID_DATA_LONG); + // TODO: verify endiness + let _ = scratch.write_u32::(mainrdata_len as u32); + } else { + scratch.push(XLR_BLOCK_ID_DATA_SHORT); + scratch.push(mainrdata_len as u8); + } + } + + let total_len: u32 = (XLOG_SIZE_OF_XLOG_RECORD + scratch.len() + mainrdata_len) as u32; + let size = maxalign(total_len); + assert!(size as usize > XLOG_SIZE_OF_XLOG_RECORD); + + let start_bytepos = recptr_to_bytepos(state.internal_available_lsn); + let end_bytepos = start_bytepos + size as u64; + + let start_recptr = bytepos_to_recptr(start_bytepos); + let end_recptr = bytepos_to_recptr(end_bytepos); + + assert!(recptr_to_bytepos(start_recptr) == start_bytepos); + assert!(recptr_to_bytepos(end_recptr) == end_bytepos); + + let mut crc = crc32c_append(0, &scratch); + for rdata in &rdatas { + crc = crc32c_append(crc, rdata); + } + + let mut 
header = XLogRecord { + xl_tot_len: total_len, + xl_xid: 0, + xl_prev: state.prev_lsn.0, + xl_info: info, + xl_rmid: rmid, + __bindgen_padding_0: [0u8; 2usize], + xl_crc: crc, + }; + + // now we have the header and can finish the crc + let header_bytes = header.encode()?; + let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + let mut header_bytes = header.encode()?.to_vec(); + assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_RECORD); + + header_bytes.extend_from_slice(&scratch); + + // finish rdatas + let mut rdatas = rdatas; + rdatas.insert(0, &header_bytes); + + write_walrecord_to_disk(state, total_len as u64, rdatas, start_recptr, end_recptr)?; + + state.internal_available_lsn = end_recptr; + state.prev_lsn = start_recptr; + Ok(()) +} + +fn write_walrecord_to_disk( + state: &mut State, + total_len: u64, + rdatas: Vec<&[u8]>, + start: Lsn, + end: Lsn, +) -> anyhow::Result<()> { + let mut curr_ptr = start; + let mut freespace = insert_freespace(curr_ptr); + let mut written: usize = 0; + + assert!(freespace >= std::mem::size_of::()); + + for mut rdata in rdatas { + while rdata.len() >= freespace { + assert!( + curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD + || freespace == 0 + ); + + state.write(curr_ptr.0, &rdata[..freespace]); + rdata = &rdata[freespace..]; + written += freespace; + curr_ptr = Lsn(curr_ptr.0 + freespace as u64); + + let mut new_page = XLogPageHeaderData { + xlp_magic: XLOG_PAGE_MAGIC as u16, + xlp_info: XLP_BKP_REMOVABLE, + xlp_tli: 1, + xlp_pageaddr: curr_ptr.0, + xlp_rem_len: (total_len - written as u64) as u32, + ..Default::default() // Put 0 in padding fields. 
+ }; + if new_page.xlp_rem_len > 0 { + new_page.xlp_info |= XLP_FIRST_IS_CONTRECORD; + } + + if curr_ptr.segment_offset(WAL_SEGMENT_SIZE) == 0 { + new_page.xlp_info |= XLP_LONG_HEADER; + let long_page = XLogLongPageHeaderData { + std: new_page, + xlp_sysid: 0, + xlp_seg_size: WAL_SEGMENT_SIZE as u32, + xlp_xlog_blcksz: XLOG_BLCKSZ as u32, + }; + let header_bytes = long_page.encode()?; + assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_LONG_PHD); + state.write(curr_ptr.0, &header_bytes); + curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64); + } else { + let header_bytes = new_page.encode()?; + assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_SHORT_PHD); + state.write(curr_ptr.0, &header_bytes); + curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64); + } + freespace = insert_freespace(curr_ptr); + } + + assert!( + curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD + || rdata.is_empty() + ); + state.write(curr_ptr.0, rdata); + curr_ptr = Lsn(curr_ptr.0 + rdata.len() as u64); + written += rdata.len(); + freespace -= rdata.len(); + } + + assert!(written == total_len as usize); + curr_ptr.0 = maxalign(curr_ptr.0); + assert!(curr_ptr == end); + Ok(()) +} + +fn maxalign(size: T) -> T +where + T: std::ops::BitAnd + + std::ops::Add + + std::ops::Not + + From, +{ + (size + T::from(7)) & !T::from(7) +} + +fn insert_freespace(ptr: Lsn) -> usize { + if ptr.block_offset() == 0 { + 0 + } else { + (XLOG_BLCKSZ as u64 - ptr.block_offset()) as usize + } +} + +const XLP_BKP_REMOVABLE: u16 = 0x0004; +const USABLE_BYTES_IN_PAGE: u64 = (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; +const USABLE_BYTES_IN_SEGMENT: u64 = ((WAL_SEGMENT_SIZE / XLOG_BLCKSZ) as u64 + * USABLE_BYTES_IN_PAGE) + - (XLOG_SIZE_OF_XLOG_RECORD - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; + +fn bytepos_to_recptr(bytepos: u64) -> Lsn { + let fullsegs = bytepos / USABLE_BYTES_IN_SEGMENT; + let mut bytesleft = bytepos % USABLE_BYTES_IN_SEGMENT; + + let seg_offset = if bytesleft < (XLOG_BLCKSZ - 
XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 { + // fits on first page of segment + bytesleft + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + // account for the first page on segment with long header + bytesleft -= (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; + let fullpages = bytesleft / USABLE_BYTES_IN_PAGE; + bytesleft %= USABLE_BYTES_IN_PAGE; + + XLOG_BLCKSZ as u64 + + fullpages * XLOG_BLCKSZ as u64 + + bytesleft + + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + }; + + Lsn(XLogSegNoOffsetToRecPtr( + fullsegs, + seg_offset as u32, + WAL_SEGMENT_SIZE, + )) +} + +fn recptr_to_bytepos(ptr: Lsn) -> u64 { + let fullsegs = ptr.segment_number(WAL_SEGMENT_SIZE); + let offset = ptr.segment_offset(WAL_SEGMENT_SIZE) as u64; + + let fullpages = offset / XLOG_BLCKSZ as u64; + let offset = offset % XLOG_BLCKSZ as u64; + + if fullpages == 0 { + fullsegs * USABLE_BYTES_IN_SEGMENT + + if offset > 0 { + assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + 0 + } + } else { + fullsegs * USABLE_BYTES_IN_SEGMENT + + (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 + + (fullpages - 1) * USABLE_BYTES_IN_PAGE + + if offset > 0 { + assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); + offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 + } else { + 0 + } + } +} diff --git a/scripts/benchmark_durations.py b/scripts/benchmark_durations.py index 7f05d72a03..01f34a1b96 100755 --- a/scripts/benchmark_durations.py +++ b/scripts/benchmark_durations.py @@ -20,7 +20,7 @@ BENCHMARKS_DURATION_QUERY = """ FROM results WHERE started_at > CURRENT_DATE - INTERVAL '%s' day - AND parent_suite = 'test_runner.performance' + AND starts_with(parent_suite, 'test_runner.performance') AND status = 'passed' GROUP BY parent_suite, suite, name @@ -31,68 +31,75 @@ BENCHMARKS_DURATION_QUERY = """ # the total duration varies from 8 to 40 minutes. # We use some pre-collected durations as a fallback to have a better distribution. 
FALLBACK_DURATION = { - "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941, - "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053, - "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67, - "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497, - "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572, - "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474, - "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262, - "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225, - "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159, - "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719, - "test_runner/performance/test_compaction.py::test_compaction": 110.222, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3, - 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147, - "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321, - "test_runner/performance/test_copy.py::test_copy[neon]": 16.579, - "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094, - "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102, - "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079, - "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119, - "test_runner/performance/test_layer_map.py::test_layer_map": 24.784, - "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068, - "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235, - 
"test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536, - "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753, - "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975, - "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522, - "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189, - "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899, - "test_runner/performance/test_startup.py::test_startup_simple": 2.51, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46, - "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522, - "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583, - 
"test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282, - "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704, - "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742, + "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135, + "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104, + "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073, + "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.759, + "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 6.885, + "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 8.758, + 
"test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 18.275, + "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 9.533, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 12.09, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 35.145, + "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 22.28, + "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.353, + "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 75.487, + "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 54.142, + "test_runner/performance/test_compaction.py::test_compaction": 110.715, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.68, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.384, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.315, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.783, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.647, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 17.04, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.01, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.902, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.077, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.4, + 
"test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.33, + "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.434, + "test_runner/performance/test_copy.py::test_copy[neon]": 13.817, + "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736, + "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868, + "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588, + "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 30.849, + "test_runner/performance/test_layer_map.py::test_layer_map": 39.378, + "test_runner/performance/test_lazy_startup.py::test_lazy_startup": 2848.938, + "test_runner/performance/test_logical_replication.py::test_logical_replication": 120.952, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 35.552, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 66.762, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 85.177, + "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 92.12, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 107.009, + "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.582, + "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 4.737, + "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.686, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.271, + 
"test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 50.719, + "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 15.992, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.566, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 13.542, + "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.35, + "test_runner/performance/test_startup.py::test_startup_simple": 13.043, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 194.841, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 286.667, + "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 85.577, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 297.626, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 646.187, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 989.776, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 125.638, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 123.554, + "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 190.083, + "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 21.016, + "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 23.028, } diff --git a/scripts/check_allowed_errors.sh b/scripts/check_allowed_errors.sh new file mode 100755 index 0000000000..87e52c1e64 --- /dev/null +++ b/scripts/check_allowed_errors.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -eu + +HELPER_DIR="$(dirname 
"${BASH_SOURCE[0]}")" +SCRIPT="test_runner/fixtures/pageserver/allowed_errors.py" + +# first run to understand all of the errors: +# +# example: ./scripts/check_allowed_errors.sh -i - < pageserver.log +# example: ./scripts/check_allowed_errors.sh -i pageserver.log +# +# then edit the test local allowed_errors to the +# test_runner/fixtures/pageserver/allowed_errors.py, then re-run to make sure +# they are handled. +# +# finally revert any local changes to allowed_errors.py. +poetry run python3 "$HELPER_DIR/../$SCRIPT" $* diff --git a/scripts/comment-test-report.js b/scripts/comment-test-report.js index 89befda71f..f42262cf48 100755 --- a/scripts/comment-test-report.js +++ b/scripts/comment-test-report.js @@ -188,7 +188,7 @@ const reportSummary = async (params) => { } const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => { - let summary = `\n### Code coverage ([full report](${coverageUrl}))\n` + let summary = `\n### Code coverage* ([full report](${coverageUrl}))\n` const coverage = await (await fetch(summaryJsonUrl)).json() for (const covType of Object.keys(coverage).sort()) { @@ -198,7 +198,7 @@ const parseCoverageSummary = async ({ summaryJsonUrl, coverageUrl, fetch }) => { summary += `- \`${covType}s\`: \`${coverage[covType]["_summary"]}\`\n` } - + summary += "\n\\* collected from Rust tests only\n" summary += `\n___\n` return summary diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py deleted file mode 100755 index 980f343047..0000000000 --- a/scripts/export_import_between_pageservers.py +++ /dev/null @@ -1,736 +0,0 @@ -# -# Script to export tenants from one pageserver and import them into another page server. -# -# Outline of steps: -# 1. Get `(last_lsn, prev_lsn)` from old pageserver -# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file -# 3. 
This tar file might be missing relation files for empty relations, if the pageserver -# is old enough (we didn't always store those). So to recreate them, we start a local -# vanilla postgres on this basebackup and ask it what relations should exist, then touch -# any missing files and re-pack the tar. -# TODO This functionality is no longer needed, so we can delete it later if we don't -# end up using the same utils for the pg 15 upgrade. Not sure. -# 4. We import the patched basebackup into a new pageserver -# 5. We export again via fullbackup, now from the new pageserver and compare the returned -# tar file with the one we imported. This confirms that we imported everything that was -# exported, but doesn't guarantee correctness (what if we didn't **export** everything -# initially?) -# 6. We wait for the new pageserver's remote_consistent_lsn to catch up -# -# For more context on how to use this, see: -# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf - -import argparse -import os -import shutil -import subprocess -import tempfile -import time -import uuid -from contextlib import closing -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -import psycopg2 -import requests -from psycopg2.extensions import connection as PgConnection -from psycopg2.extensions import parse_dsn - -############################################### -### client-side utils copied from test fixtures -############################################### - -Env = Dict[str, str] - -_global_counter = 0 - - -def global_counter() -> int: - """A really dumb global counter. - This is useful for giving output files a unique number, so if we run the - same command multiple times we can keep their output separate. 
- """ - global _global_counter - _global_counter += 1 - return _global_counter - - -def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: - """Run a process and capture its output - Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr" - where "cmd" is the name of the program and NNN is an incrementing - counter. - If those files already exist, we will overwrite them. - Returns basepath for files with captured output. - """ - assert isinstance(cmd, list) - base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) - basepath = os.path.join(capture_dir, base) - stdout_filename = basepath + ".stdout" - stderr_filename = basepath + ".stderr" - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print('(capturing output to "{}.stdout")'.format(base)) - subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) - - return basepath - - -class PgBin: - """A helper class for executing postgres binaries""" - - def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): - self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") - self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join( - str(pg_distrib_dir), "v{}".format(pg_version), "lib" - ) - - def _fixpath(self, command: List[str]): - if "/" not in command[0]: - command[0] = os.path.join(self.pg_bin_path, command[0]) - - def _build_env(self, env_add: Optional[Env]) -> Env: - if env_add is None: - return self.env - env = self.env.copy() - env.update(env_add) - return env - - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): - """ - Run one of the postgres binaries. - The command should be in list form, e.g. ['pgbench', '-p', '55432'] - All the necessary environment variables will be set. 
- If the first argument (the command name) doesn't include a path (no '/' - characters present), then it will be edited to include the correct path. - If you want stdout/stderr captured to files, use `run_capture` instead. - """ - - self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) - env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) - - def run_capture( - self, - command: List[str], - env: Optional[Env] = None, - cwd: Optional[str] = None, - **kwargs: Any, - ) -> str: - """ - Run one of the postgres binaries, with stderr and stdout redirected to a file. - This is just like `run`, but for chatty programs. Returns basepath for files - with captured output. - """ - - self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) - env = self._build_env(env) - return subprocess_capture( - str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs - ) - - -class PgProtocol: - """Reusable connection logic""" - - def __init__(self, **kwargs): - self.default_options = kwargs - - def conn_options(self, **kwargs): - conn_options = self.default_options.copy() - if "dsn" in kwargs: - conn_options.update(parse_dsn(kwargs["dsn"])) - conn_options.update(kwargs) - - # Individual statement timeout in seconds. 2 minutes should be - # enough for our tests, but if you need a longer, you can - # change it by calling "SET statement_timeout" after - # connecting. - conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}" - - return conn_options - - # autocommit=True here by default because that's what we need most of the time - def connect(self, autocommit=True, **kwargs) -> PgConnection: - """ - Connect to the node. - Returns psycopg2's connection object. - This method passes all extra params to connstr. - """ - conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs)) - - # WARNING: this setting affects *all* tests! 
- conn.autocommit = autocommit - return conn - - def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]: - """ - Execute query against the node and return all rows. - This method passes all extra params to connstr. - """ - return self.safe_psql_many([query], **kwargs)[0] - - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: - """ - Execute queries against the node and return all rows. - This method passes all extra params to connstr. - """ - result: List[List[Any]] = [] - with closing(self.connect(**kwargs)) as conn: - with conn.cursor() as cur: - for query in queries: - print(f"Executing query: {query}") - cur.execute(query) - - if cur.description is None: - result.append([]) # query didn't return data - else: - result.append(cast(List[Any], cur.fetchall())) - return result - - -class VanillaPostgres(PgProtocol): - def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True): - super().__init__(host="localhost", port=port, dbname="postgres") - self.pgdatadir = pgdatadir - self.pg_bin = pg_bin - self.running = False - if init: - self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)]) - self.configure([f"port = {port}\n"]) - - def configure(self, options: List[str]): - """Append lines into postgresql.conf file.""" - assert not self.running - with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file: - conf_file.write("\n".join(options)) - - def start(self, log_path: Optional[str] = None): - assert not self.running - self.running = True - - log_path = log_path or os.path.join(self.pgdatadir, "pg.log") - - self.pg_bin.run_capture( - ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"] - ) - - def stop(self): - assert self.running - self.running = False - self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"]) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc, tb): - if self.running: - self.stop() - - -class 
NeonPageserverApiException(Exception): - pass - - -class NeonPageserverHttpClient(requests.Session): - def __init__(self, host, port): - super().__init__() - self.host = host - self.port = port - - def verbose_error(self, res: requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()["msg"] - except: # noqa: E722 - msg = "" - raise NeonPageserverApiException(msg) from e - - def check_status(self): - self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status() - - def tenant_list(self): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists): - res = self.post( - f"http://{self.host}:{self.port}/v1/tenant", - json={"new_tenant_id": new_tenant_id.hex, "generation": 1}, - ) - - if res.status_code == 409: - if ok_if_exists: - print(f"could not create tenant: already exists for id {new_tenant_id}") - else: - res.raise_for_status() - elif res.status_code == 201: - print(f"created tenant {new_tenant_id}") - else: - self.verbose_error(res) - - return new_tenant_id - - def timeline_list(self, tenant_id: uuid.UUID): - res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true" - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - -def lsn_to_hex(num: int) -> str: - """Convert lsn from int to standard hex notation.""" - return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) - - -def lsn_from_hex(lsn_hex: str) 
-> int: - """Convert lsn from hex notation to int.""" - left, right = lsn_hex.split("/") - return (int(left, 16) << 32) + int(right, 16) - - -def remote_consistent_lsn( - pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID -) -> int: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - lsn_str = detail["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return lsn_from_hex(lsn_str) - - -def wait_for_upload( - pageserver_http_client: NeonPageserverHttpClient, - tenant: uuid.UUID, - timeline: uuid.UUID, - lsn: int, -): - """waits for local timeline upload up to specified lsn""" - for i in range(10): - current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - return - print( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 - ) - ) - time.sleep(1) - - raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) - ) - ) - - -############## -# End of utils -############## - - -def pack_base(log_dir, restored_dir, output_tar): - """Create tar file from basebackup, being careful to produce relative filenames.""" - tmp_tar_name = "tmp.tar" - tmp_tar_path = os.path.join(restored_dir, tmp_tar_name) - cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir) - # We actually cd into the dir and call tar from there. If we call tar from - # outside we won't encode filenames as relative, and they won't parse well - # on import. 
- subprocess_capture(log_dir, cmd, cwd=restored_dir) - shutil.move(tmp_tar_path, output_tar) - - -def reconstruct_paths(log_dir, pg_bin, base_tar, port: int): - """Reconstruct what relation files should exist in the datadir by querying postgres.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir]) - - # Start a vanilla postgres from the given datadir and query it to find - # what relfiles should exist, but possibly don't. - with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) - vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log")) - - # Create database based on template0 because we can't connect to template0 - query = "create database template0copy template template0" - vanilla_pg.safe_psql(query, user="cloud_admin") - vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin") - - # Get all databases - query = "select oid, datname from pg_database" - oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin") - template0_oid = [ - oid for (oid, database) in oid_dbname_pairs if database == "template0" - ][0] - - # Get rel paths for each database - for oid, database in oid_dbname_pairs: - if database == "template0": - # We can't connect to template0 - continue - - query = "select relname, pg_relation_filepath(oid) from pg_class" - result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database) - for _relname, filepath in result: - if filepath is not None: - if database == "template0copy": - # Add all template0copy paths to template0 - prefix = f"base/{oid}/" - if filepath.startswith(prefix): - suffix = filepath[len(prefix) :] - yield f"base/{template0_oid}/{suffix}" - elif filepath.startswith("global"): - print(f"skipping {database} global file {filepath}") - else: - raise AssertionError - else: - yield filepath - - -def touch_missing_rels(log_dir, corrupt_tar, output_tar, 
paths): - """Add the appropriate empty files to a basebadkup tar.""" - with tempfile.TemporaryDirectory() as restored_dir: - # Unpack the base tar - subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir]) - - # Touch files that don't exist - for path in paths: - absolute_path = os.path.join(restored_dir, path) - exists = os.path.exists(absolute_path) - if not exists: - print(f"File {absolute_path} didn't exist. Creating..") - Path(absolute_path).touch() - - # Repackage - pack_base(log_dir, restored_dir, output_tar) - - -# HACK This is a workaround for exporting from old pageservers that -# can't export empty relations. In this case we need to start -# a vanilla postgres from the exported datadir, and query it -# to see what empty relations are missing, and then create -# those empty files before importing. -def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int): - reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)) - touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths) - - -def get_rlsn(pageserver_connstr, tenant_id, timeline_id): - with closing(psycopg2.connect(pageserver_connstr)) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}" - cur.execute(cmd) - res = cur.fetchone() - assert res is not None - prev_lsn = res[0] - last_lsn = res[1] - - return last_lsn, prev_lsn - - -def import_timeline( - args, - psql_path, - pageserver_connstr, - pageserver_http, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Import timelines to new pageserver - import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}" - full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """ - - stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr") - stdout_filename = os.path.join(args.work_dir, 
f"import_{tenant_id}_{timeline_id}.stdout") - - print(f"Running: {full_cmd}") - - with open(stdout_filename, "w") as stdout_f: - with open(stderr_filename2, "w") as stderr_f: - print(f"(capturing output to {stdout_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - full_cmd, - stdout=stdout_f, - stderr=stderr_f, - env=pg_bin._build_env(None), - shell=True, - check=True, - ) - - print("Done import") - - # Wait until pageserver persists the files - wait_for_upload( - pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn) - ) - - -def export_timeline( - args, - psql_path, - pageserver_connstr, - tenant_id, - timeline_id, - last_lsn, - prev_lsn, - tar_filename, - pg_version, -): - # Choose filenames - incomplete_filename = tar_filename + ".incomplete" - stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr") - - # Construct export command - query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}" - cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query] - - # Run export command - print(f"Running: {cmd}") - with open(incomplete_filename, "w") as stdout_f: - with open(stderr_filename, "w") as stderr_f: - print(f"(capturing output to {incomplete_filename})") - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - subprocess.run( - cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True - ) - - # Add missing rels - pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version) - add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port) - - # Log more info - file_size = os.path.getsize(tar_filename) - print(f"Done export: {tar_filename}, size {file_size}") - - -def main(args: argparse.Namespace): - # any psql version will do here. 
use current DEFAULT_PG_VERSION = 15 - psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") - - old_pageserver_host = args.old_pageserver_host - new_pageserver_host = args.new_pageserver_host - - old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port) - old_http_client.check_status() - old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}" - - new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port) - new_http_client.check_status() - new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}" - - for tenant_id in args.tenants: - print(f"Tenant: {tenant_id}") - timelines = old_http_client.timeline_list(uuid.UUID(tenant_id)) - print(f"Timelines: {timelines}") - - # Create tenant in new pageserver - if args.only_import is False and not args.timelines: - new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists) - - for timeline in timelines: - # Skip timelines we don't need to export - if args.timelines and timeline["timeline_id"] not in args.timelines: - print(f"Skipping timeline {timeline['timeline_id']}") - continue - - # Choose filenames - tar_filename = os.path.join( - args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar" - ) - - pg_version = timeline["pg_version"] - - # Export timeline from old pageserver - if args.only_import is False: - last_lsn, prev_lsn = get_rlsn( - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - ) - export_timeline( - args, - psql_path, - old_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Import into new pageserver - import_timeline( - args, - psql_path, - new_pageserver_connstr, - new_http_client, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - tar_filename, - pg_version, - ) - - # Re-export and compare 
- re_export_filename = tar_filename + ".reexport" - export_timeline( - args, - psql_path, - new_pageserver_connstr, - timeline["tenant_id"], - timeline["timeline_id"], - last_lsn, - prev_lsn, - re_export_filename, - pg_version, - ) - - # Check the size is the same - old_size = (os.path.getsize(tar_filename),) - new_size = (os.path.getsize(re_export_filename),) - if old_size != new_size: - raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}") - - -def non_zero_tcp_port(arg: Any): - port = int(arg) - if port < 1 or port > 65535: - raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}") - return port - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tenant-id", - dest="tenants", - required=True, - nargs="+", - help="Id of the tenant to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--timeline-id", - dest="timelines", - required=False, - nargs="+", - help="Id of the timeline to migrate. You can pass multiple arguments", - ) - parser.add_argument( - "--from-host", - dest="old_pageserver_host", - required=True, - help="Host of the pageserver to migrate data from", - ) - parser.add_argument( - "--from-http-port", - dest="old_pageserver_http_port", - required=False, - type=int, - default=9898, - help="HTTP port of the pageserver to migrate data from. Default: 9898", - ) - parser.add_argument( - "--from-pg-port", - dest="old_pageserver_pg_port", - required=False, - type=int, - default=6400, - help="pg port of the pageserver to migrate data from. Default: 6400", - ) - parser.add_argument( - "--to-host", - dest="new_pageserver_host", - required=True, - help="Host of the pageserver to migrate data to", - ) - parser.add_argument( - "--to-http-port", - dest="new_pageserver_http_port", - required=False, - default=9898, - type=int, - help="HTTP port of the pageserver to migrate data to. 
Default: 9898", - ) - parser.add_argument( - "--to-pg-port", - dest="new_pageserver_pg_port", - required=False, - default=6400, - type=int, - help="pg port of the pageserver to migrate data to. Default: 6400", - ) - parser.add_argument( - "--ignore-tenant-exists", - dest="ok_if_exists", - required=False, - help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.", - ) - parser.add_argument( - "--pg-distrib-dir", - dest="pg_distrib_dir", - required=False, - default="/usr/local/", - help="Path where postgres binaries are installed. Default: /usr/local/", - ) - parser.add_argument( - "--psql-path", - dest="psql_path", - required=False, - default="/usr/local/v14/bin/psql", - help="Path to the psql binary. Default: /usr/local/v14/bin/psql", - ) - parser.add_argument( - "--only-import", - dest="only_import", - required=False, - default=False, - action="store_true", - help="Skip export and tenant creation part", - ) - parser.add_argument( - "--work-dir", - dest="work_dir", - required=True, - default=False, - help="directory where temporary tar files are stored", - ) - parser.add_argument( - "--tmp-pg-port", - dest="tmp_pg_port", - required=False, - default=55439, - type=non_zero_tcp_port, - help="localhost port to use for temporary postgres instance", - ) - args = parser.parse_args() - main(args) diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py index b07e4bea9b..919a9278a9 100755 --- a/scripts/flaky_tests.py +++ b/scripts/flaky_tests.py @@ -3,11 +3,13 @@ import argparse import json import logging +import os from collections import defaultdict -from typing import DefaultDict, Dict +from typing import Any, DefaultDict, Dict, Optional import psycopg2 import psycopg2.extras +import toml FLAKY_TESTS_QUERY = """ SELECT @@ -45,6 +47,36 @@ def main(args: argparse.Namespace): logging.error("cannot fetch flaky tests from the DB due to an error", exc) rows = [] + # If a test run has 
non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not tokio-epoll-uring), + # use it to parametrize test name along with build_type and pg_version + # + # See test_runner/fixtures/parametrize.py for details + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( + "", + "tokio-epoll-uring", + ): + pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" + else: + pageserver_virtual_file_io_engine_parameter = "" + + # re-use existing records of flaky tests from before parametrization by compaction_algorithm + def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + """Duplicated from parametrize.py""" + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + pageserver_default_tenant_config_compaction_algorithm_parameter = "" + if ( + explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + pageserver_default_tenant_config_compaction_algorithm_parameter = ( + f"-{explicit_default['kind']}" + ) + for row in rows: # We don't want to automatically rerun tests in a performance suite if row["parent_suite"] != "test_runner.regress": @@ -53,10 +85,10 @@ def main(args: argparse.Namespace): if row["name"].endswith("]"): parametrized_test = row["name"].replace( "[", - f"[{build_type}-pg{pg_version}-", + f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-", ) else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]" + parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]" res[row["parent_suite"]][row["suite"]][parametrized_test] = True diff --git a/scripts/generate_and_push_perf_report.sh 
b/scripts/generate_and_push_perf_report.sh index 9e03302b0f..178c570b13 100755 --- a/scripts/generate_and_push_perf_report.sh +++ b/scripts/generate_and_push_perf_report.sh @@ -8,17 +8,3 @@ SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) echo "Uploading perf report to neon pg" # ingest per test results data into neon backed postgres running in staging to build grafana reports on that data DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM" - -# Activate poetry's venv. Needed because git upload does not run in a project dir (it uses tmp to store the repository) -# so the problem occurs because poetry cannot find pyproject.toml in temp dir created by git upload -# shellcheck source=/dev/null -. "$(poetry env info --path)"/bin/activate - -echo "Uploading perf result to zenith-perf-data" -scripts/git-upload \ - --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \ - --message="add performance test result for $GITHUB_SHA neon revision" \ - --branch=master \ - copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\ - --merge \ - --run-cmd "python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html" diff --git a/scripts/generate_perf_report_page.py b/scripts/generate_perf_report_page.py deleted file mode 100755 index b5b49bb600..0000000000 --- a/scripts/generate_perf_report_page.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, cast - -from jinja2 import Template - -# skip 'input' columns. 
They are included in the header and just blow the table -EXCLUDE_COLUMNS = frozenset( - { - "scale", - "duration", - "number_of_clients", - "number_of_threads", - "init_start_timestamp", - "init_end_timestamp", - "run_start_timestamp", - "run_end_timestamp", - } -) - -KEY_EXCLUDE_FIELDS = frozenset( - { - "init_start_timestamp", - "init_end_timestamp", - "run_start_timestamp", - "run_end_timestamp", - } -) -NEGATIVE_COLOR = "negative" -POSITIVE_COLOR = "positive" -EPS = 1e-6 - - -@dataclass -class SuitRun: - revision: str - values: Dict[str, Any] - - -@dataclass -class SuitRuns: - platform: str - suit: str - common_columns: List[Tuple[str, str]] - value_columns: List[str] - runs: List[SuitRun] - - -@dataclass -class RowValue: - value: str - color: str - ratio: str - - -def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]: - value_columns = [] - common_columns = [] - for item in values: - if item["name"] in KEY_EXCLUDE_FIELDS: - continue - if item["report"] != "test_param": - value_columns.append(cast(str, item["name"])) - else: - common_columns.append((cast(str, item["name"]), cast(str, item["value"]))) - value_columns.sort() - common_columns.sort(key=lambda x: x[0]) # sort by name - return common_columns, value_columns - - -def format_ratio(ratio: float, report: str) -> Tuple[str, str]: - color = "" - sign = "+" if ratio > 0 else "" - if abs(ratio) < 0.05: - return f" ({sign}{ratio:.2f})", color - - if report not in {"test_param", "higher_is_better", "lower_is_better"}: - raise ValueError(f"Unknown report type: {report}") - - if report == "test_param": - return f"{ratio:.2f}", color - - if ratio > 0: - if report == "higher_is_better": - color = POSITIVE_COLOR - elif report == "lower_is_better": - color = NEGATIVE_COLOR - elif ratio < 0: - if report == "higher_is_better": - color = NEGATIVE_COLOR - elif report == "lower_is_better": - color = POSITIVE_COLOR - - return f" ({sign}{ratio:.2f})", color - - -def extract_value(name: 
str, suit_run: SuitRun) -> Optional[Dict[str, Any]]: - for item in suit_run.values["data"]: - if item["name"] == name: - return cast(Dict[str, Any], item) - return None - - -def get_row_values( - columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun] -) -> List[RowValue]: - row_values = [] - for column in columns: - current_value = extract_value(column, run_result) - if current_value is None: - # should never happen - raise ValueError(f"{column} not found in {run_result.values}") - - value = current_value["value"] - if isinstance(value, float): - value = f"{value:.2f}" - - if prev_result is None: - row_values.append(RowValue(value, "", "")) - continue - - prev_value = extract_value(column, prev_result) - if prev_value is None: - # this might happen when new metric is added and there is no value for it in previous run - # let this be here, TODO add proper handling when this actually happens - raise ValueError(f"{column} not found in previous result") - # adding `EPS` to each term to avoid ZeroDivisionError when the denominator is zero - ratio = (float(value) + EPS) / (float(prev_value["value"]) + EPS) - 1 - ratio_display, color = format_ratio(ratio, current_value["report"]) - row_values.append(RowValue(value, color, ratio_display)) - return row_values - - -@dataclass -class SuiteRunTableRow: - revision: str - values: List[RowValue] - - -def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]: - rows = [] - prev_run = None - for run in runs: - rows.append( - SuiteRunTableRow( - revision=run.revision, values=get_row_values(value_columns, run, prev_run) - ) - ) - prev_run = run - - return rows - - -def main(args: argparse.Namespace) -> None: - input_dir = Path(args.input_dir) - grouped_runs: Dict[str, SuitRuns] = {} - # we have files in form: _.json - # fill them in the hashmap so we have grouped items for the - # same run configuration (scale, duration etc.) ordered by counter. 
- for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])): - run_data = json.loads(item.read_text()) - revision = run_data["revision"] - - for suit_result in run_data["result"]: - key = "{}{}".format(run_data["platform"], suit_result["suit"]) - # pack total duration as a synthetic value - total_duration = suit_result["total_duration"] - suit_result["data"].append( - { - "name": "total_duration", - "value": total_duration, - "unit": "s", - "report": "lower_is_better", - } - ) - common_columns, value_columns = get_columns(suit_result["data"]) - - grouped_runs.setdefault( - key, - SuitRuns( - platform=run_data["platform"], - suit=suit_result["suit"], - common_columns=common_columns, - value_columns=value_columns, - runs=[], - ), - ) - - grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result)) - context = {} - for result in grouped_runs.values(): - suit = result.suit - context[suit] = { - "common_columns": result.common_columns, - "value_columns": result.value_columns, - "platform": result.platform, - # reverse the order so newest results are on top of the table - "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)), - } - - template = Template((Path(__file__).parent / "perf_report_template.html").read_text()) - - Path(args.out).write_text(template.render(context=context)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--input-dir", - dest="input_dir", - required=True, - help="Directory with jsons generated by the test suite", - ) - parser.add_argument("--out", required=True, help="Output html file path") - args = parser.parse_args() - main(args) diff --git a/scripts/git-upload b/scripts/git-upload deleted file mode 100755 index d56c0f8e94..0000000000 --- a/scripts/git-upload +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import os -import shlex -import shutil -import subprocess -import sys -import textwrap -from contextlib 
import contextmanager -from distutils.dir_util import copy_tree -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional - - -def absolute_path(path): - return Path(path).resolve() - - -def relative_path(path): - path = Path(path) - if path.is_absolute(): - raise Exception(f'path `{path}` must be relative!') - return path - - -@contextmanager -def chdir(cwd: Path): - old = os.getcwd() - os.chdir(cwd) - try: - yield cwd - finally: - os.chdir(old) - - -def run(cmd, *args, **kwargs): - print('$', ' '.join(cmd)) - subprocess.check_call(cmd, *args, **kwargs) - - -class GitRepo: - def __init__(self, url, branch: Optional[str] = None): - self.url = url - self.cwd = TemporaryDirectory() - self.branch = branch - - args = [ - 'git', - 'clone', - '--single-branch', - ] - if self.branch: - args.extend(['--branch', self.branch]) - - subprocess.check_call([ - *args, - str(url), - self.cwd.name, - ]) - - def is_dirty(self): - res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip() - return bool(res) - - def update(self, message, action, branch=None): - with chdir(self.cwd.name): - if not branch: - cmd = ['git', 'branch', '--show-current'] - branch = subprocess.check_output(cmd, text=True).strip() - - # Run action in repo's directory - action() - - run(['git', 'add', '.']) - - if not self.is_dirty(): - print('No changes detected, quitting') - return - - git_with_user = [ - 'git', - '-c', - 'user.name=vipvap', - '-c', - 'user.email=vipvap@zenith.tech', - ] - run(git_with_user + [ - 'commit', - '--author="vipvap "', - f'--message={message}', - ]) - - for _ in range(5): - try: - run(['git', 'fetch', 'origin', branch]) - run(git_with_user + ['rebase', f'origin/{branch}']) - run(['git', 'push', 'origin', branch]) - return - - except subprocess.CalledProcessError as e: - print(f'failed to update branch `{branch}`: {e}', file=sys.stderr) - - raise Exception(f'failed to update branch `{branch}`') - - -def do_copy(args): - 
src = args.src - dst = args.dst - - if args.forbid_overwrite and dst.exists(): - raise FileExistsError(f"File exists: '{dst}'") - - if src.is_dir(): - if not args.merge: - shutil.rmtree(dst, ignore_errors=True) - # distutils is deprecated, but this is a temporary workaround before python version bump - # here we need dir_exists_ok=True from shutil.copytree which is available in python 3.8+ - copy_tree(str(src), str(dst)) - else: - shutil.copy(src, dst) - - if args.run_cmd: - run(shlex.split(args.run_cmd)) - - -def main(): - parser = argparse.ArgumentParser(description='Git upload tool') - parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url') - parser.add_argument('--message', type=str, metavar='TEXT', help='commit message') - parser.add_argument('--branch', type=str, metavar='TEXT', help='target git repo branch') - - commands = parser.add_subparsers(title='commands', dest='subparser_name') - - p_copy = commands.add_parser( - 'copy', - help='copy file into the repo', - formatter_class=argparse.RawTextHelpFormatter, - ) - p_copy.add_argument('src', type=absolute_path, help='source path') - p_copy.add_argument('dst', type=relative_path, help='relative dest path') - p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites') - p_copy.add_argument( - '--merge', - action='store_true', - help='when copying a directory do not delete existing data, but add new files') - p_copy.add_argument('--run-cmd', - help=textwrap.dedent('''\ - run arbitrary cmd on top of copied files, - example usage is static content generation - based on current repository state\ - ''')) - - args = parser.parse_args() - - commands = { - 'copy': do_copy, - } - - action = commands.get(args.subparser_name) - if action: - message = args.message or 'update' - GitRepo(args.repo, args.branch).update(message, lambda: action(args)) - else: - parser.print_usage() - - -if __name__ == '__main__': - main() diff --git 
a/scripts/ingest_regress_test_result.py b/scripts/ingest_regress_test_result.py deleted file mode 100644 index 39c1c02941..0000000000 --- a/scripts/ingest_regress_test_result.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging -import os -import re -import sys -from contextlib import contextmanager -from pathlib import Path - -import backoff -import psycopg2 - -CREATE_TABLE = """ -CREATE TABLE IF NOT EXISTS regress_test_results ( - id SERIAL PRIMARY KEY, - reference CHAR(255), - revision CHAR(40), - build_type CHAR(16), - data JSONB -) -""" - - -def err(msg): - print(f"error: {msg}") - sys.exit(1) - - -@contextmanager -def get_connection_cursor(): - connstr = os.getenv("DATABASE_URL") - if not connstr: - err("DATABASE_URL environment variable is not set") - - @backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150) - def connect(connstr): - conn = psycopg2.connect(connstr, connect_timeout=30) - conn.autocommit = True - return conn - - conn = connect(connstr) - try: - with conn.cursor() as cur: - yield cur - finally: - if conn is not None: - conn.close() - - -def create_table(cur): - cur.execute(CREATE_TABLE) - - -def ingest_regress_test_result( - cursor, reference: str, revision: str, build_type: str, data_file: Path -): - data = data_file.read_text() - # In the JSON report we can have lines related to LazyFixture with escaped double-quote - # It's hard to insert them into jsonb field as is, so replace \" with ' to make it easier for us - # - # "" -> "" - data = re.sub(r'("")', r"\g<1>'\g<2>'\g<3>", data) - values = ( - reference, - revision, - build_type, - data, - ) - cursor.execute( - """ - INSERT INTO regress_test_results ( - reference, - revision, - build_type, - data - ) VALUES (%s, %s, %s, %s) - """, - values, - ) - - -def main(): - parser = argparse.ArgumentParser( - description="Regress test result uploader. 
\ - Database connection string should be provided via DATABASE_URL environment variable", - ) - parser.add_argument("--initdb", action="store_true", help="Initialuze database") - parser.add_argument( - "--reference", type=str, required=True, help="git reference, for example refs/heads/main" - ) - parser.add_argument("--revision", type=str, required=True, help="git revision") - parser.add_argument( - "--build-type", type=str, required=True, help="build type: release, debug or remote" - ) - parser.add_argument( - "--ingest", type=Path, required=True, help="Path to regress test result file" - ) - - args = parser.parse_args() - with get_connection_cursor() as cur: - if args.initdb: - create_table(cur) - - if not args.ingest.exists(): - err(f"ingest path {args.ingest} does not exist") - - ingest_regress_test_result( - cur, - reference=args.reference, - revision=args.revision, - build_type=args.build_type, - data_file=args.ingest, - ) - - -if __name__ == "__main__": - logging.getLogger("backoff").addHandler(logging.StreamHandler()) - main() diff --git a/scripts/ps_ec2_setup_instance_store b/scripts/ps_ec2_setup_instance_store new file mode 100755 index 0000000000..1f88f252eb --- /dev/null +++ b/scripts/ps_ec2_setup_instance_store @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +# This script sets up an ext4 partition on an EC2 storage-optimized instance's instance store volume. +# Unix permission/ownership is set to the calling user (the script does sudo internally.) +# +# It's intentionally not idempotent; don't take on that complexity in a bash script. + +set -euo pipefail +set -x + +# This seems crude, but, apparently instance store NVMe volumes aren't exposed in the in instance metadata block-device-mapping. 
+# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/block-device-mapping-concepts.html#bdm-instance-metadata +if [ "$(cat /sys/class/block/nvme1n1/device/model)" != "Amazon EC2 NVMe Instance Storage " ]; then + echo "nvme1n1 is not Amazon EC2 NVMe Instance Storage: '$(cat /sys/class/block/nvme1n1/device/model)'" + exit 1 +fi + +# NB: we DO NOT warm up all the blocks on the drive as recommended by https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/disk-performance.html +# The reason is that we don't do that in production either. + +# do all the on-disk initialization work now instead of a background kernel thread +# so that we're ready for benchmarking right after this line +sudo mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/nvme1n1 + +MOUNTPOINT=/instance_store +sudo mkdir "$MOUNTPOINT" +sudo mount /dev/nvme1n1 "$MOUNTPOINT" +sudo chown -R "$(id -u)":"$(id -g)" "$MOUNTPOINT" + +TEST_OUTPUT="$MOUNTPOINT/test_output" +mkdir "$TEST_OUTPUT" + +NEON_REPO_DIR="$MOUNTPOINT/repo_dir" +mkdir "$NEON_REPO_DIR" + +cat </dev/null | jq --raw-output '.jwt') +AUTH_TOKEN=$(curl https://console-stage.neon.build/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # prod: AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt') # check diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index d66cbefa45..1a6fb7fedf 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -147,6 +147,7 @@ async fn publish(client: Option, n_keys: u64) { http_connstr: "zenith-1-sk-1.local:7677".to_owned(), 
local_start_lsn: 0, availability_zone: None, + standby_horizon: 0, }; counter += 1; yield info; diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index 7d1b63d23f..a420fd9c66 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -42,6 +42,7 @@ message SafekeeperTimelineInfo { uint64 remote_consistent_lsn = 7; uint64 peer_horizon_lsn = 8; uint64 local_start_lsn = 9; + uint64 standby_horizon = 14; // A connection string to use for WAL receiving. string safekeeper_connstr = 10; // HTTP endpoint connection string @@ -105,4 +106,6 @@ message SafekeeperDiscoveryResponse { string safekeeper_connstr = 4; // Availability zone of a safekeeper. optional string availability_zone = 5; + // Replica apply LSN + uint64 standby_horizon = 6; } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 4e5f8ed724..0a4af543ab 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -196,8 +196,13 @@ impl SubscriptionKey { /// Parse from FilterTenantTimelineId pub fn from_proto_filter_tenant_timeline_id( - f: &FilterTenantTimelineId, + opt: Option<&FilterTenantTimelineId>, ) -> Result { + if opt.is_none() { + return Ok(SubscriptionKey::All); + } + + let f = opt.unwrap(); if !f.enabled { return Ok(SubscriptionKey::All); } @@ -534,10 +539,7 @@ impl BrokerService for Broker { .remote_addr() .expect("TCPConnectInfo inserted by handler"); let proto_filter = request.into_inner(); - let ttid_filter = proto_filter - .tenant_timeline_id - .as_ref() - .ok_or_else(|| Status::new(Code::InvalidArgument, "missing tenant_timeline_id"))?; + let ttid_filter = proto_filter.tenant_timeline_id.as_ref(); let sub_key = SubscriptionKey::from_proto_filter_tenant_timeline_id(ttid_filter)?; let types_set = proto_filter @@ -734,6 +736,7 @@ mod tests { http_connstr: "neon-1-sk-1.local:7677".to_owned(), local_start_lsn: 0, availability_zone: None, + 
standby_horizon: 0, }) } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml new file mode 100644 index 0000000000..b54dea5d47 --- /dev/null +++ b/storage_controller/Cargo.toml @@ -0,0 +1,55 @@ +[package] +name = "storage_controller" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[[bin]] +name = "storage_controller" +path = "src/main.rs" + +[features] +default = [] +# Enables test-only APIs and behaviors +testing = [] + +[dependencies] +anyhow.workspace = true +aws-config.workspace = true +bytes.workspace = true +camino.workspace = true +clap.workspace = true +fail.workspace = true +futures.workspace = true +git-version.workspace = true +hex.workspace = true +hyper.workspace = true +humantime.workspace = true +itertools.workspace = true +lasso.workspace = true +once_cell.workspace = true +pageserver_api.workspace = true +pageserver_client.workspace = true +postgres_connection.workspace = true +reqwest = { workspace = true, features = ["stream"] } +routerify.workspace = true +serde.workspace = true +serde_json.workspace = true +thiserror.workspace = true +tokio.workspace = true +tokio-util.workspace = true +tracing.workspace = true +measured.workspace = true +scopeguard.workspace = true +strum.workspace = true +strum_macros.workspace = true + +diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } +diesel_migrations = { version = "2.1.0" } +r2d2 = { version = "0.8.10" } + +utils = { path = "../libs/utils/" } +metrics = { path = "../libs/metrics/" } +control_plane = { path = "../control_plane" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } + diff --git a/storage_controller/migrations/.keep b/storage_controller/migrations/.keep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql new file mode 100644 index 
0000000000..a9f5260911 --- /dev/null +++ b/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql @@ -0,0 +1,6 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + +DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); +DROP FUNCTION IF EXISTS diesel_set_updated_at(); diff --git a/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql new file mode 100644 index 0000000000..d68895b1a7 --- /dev/null +++ b/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql @@ -0,0 +1,36 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. 
+ + + + +-- Sets up a trigger for the given table to automatically set a column called +-- `updated_at` whenever the row is modified (unless `updated_at` was included +-- in the modified columns) +-- +-- # Example +-- +-- ```sql +-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); +-- +-- SELECT diesel_manage_updated_at('users'); +-- ``` +CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ +BEGIN + EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s + FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ +BEGIN + IF ( + NEW IS DISTINCT FROM OLD AND + NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at + ) THEN + NEW.updated_at := current_timestamp; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; diff --git a/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql new file mode 100644 index 0000000000..b875b91c00 --- /dev/null +++ b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql @@ -0,0 +1 @@ +DROP TABLE tenant_shards; diff --git a/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql new file mode 100644 index 0000000000..2ffdae6287 --- /dev/null +++ b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql @@ -0,0 +1,13 @@ +CREATE TABLE tenant_shards ( + tenant_id VARCHAR NOT NULL, + shard_number INTEGER NOT NULL, + shard_count INTEGER NOT NULL, + PRIMARY KEY(tenant_id, shard_number, shard_count), + shard_stripe_size INTEGER NOT NULL, + generation INTEGER NOT NULL, + generation_pageserver BIGINT NOT NULL, + placement_policy VARCHAR NOT NULL, + splitting SMALLINT NOT NULL, + -- config is JSON encoded, opaque to the 
database. + config TEXT NOT NULL +); \ No newline at end of file diff --git a/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql new file mode 100644 index 0000000000..ec303bc8cf --- /dev/null +++ b/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql @@ -0,0 +1 @@ +DROP TABLE nodes; diff --git a/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql new file mode 100644 index 0000000000..9be0880fa4 --- /dev/null +++ b/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql @@ -0,0 +1,10 @@ +CREATE TABLE nodes ( + node_id BIGINT PRIMARY KEY NOT NULL, + + scheduling_policy VARCHAR NOT NULL, + + listen_http_addr VARCHAR NOT NULL, + listen_http_port INTEGER NOT NULL, + listen_pg_addr VARCHAR NOT NULL, + listen_pg_port INTEGER NOT NULL +); \ No newline at end of file diff --git a/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql new file mode 100644 index 0000000000..503231f69d --- /dev/null +++ b/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE tenant_shards ALTER generation SET NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL; diff --git a/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql new file mode 100644 index 0000000000..7e1e3cfe90 --- /dev/null +++ b/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql @@ -0,0 +1,4 @@ + + +ALTER TABLE tenant_shards ALTER generation DROP NOT NULL; +ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql 
b/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql new file mode 100644 index 0000000000..897c7e0d01 --- /dev/null +++ b/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Double": 1}' where placement_policy='{"Attached": 1}'; +UPDATE tenant_shards set placement_policy='"Single"' where placement_policy='{"Attached": 0}'; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql new file mode 100644 index 0000000000..c898ac9aee --- /dev/null +++ b/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql @@ -0,0 +1,3 @@ + +UPDATE tenant_shards set placement_policy='{"Attached": 1}' where placement_policy='{"Double": 1}'; +UPDATE tenant_shards set placement_policy='{"Attached": 0}' where placement_policy='"Single"'; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql new file mode 100644 index 0000000000..33c06dc03d --- /dev/null +++ b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql @@ -0,0 +1,3 @@ +-- This file should undo anything in `up.sql` + +ALTER TABLE tenant_shards drop scheduling_policy; \ No newline at end of file diff --git a/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql new file mode 100644 index 0000000000..aa00f0d2ca --- /dev/null +++ b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql @@ -0,0 +1,2 @@ + +ALTER TABLE tenant_shards add scheduling_policy VARCHAR NOT NULL DEFAULT '"Active"'; diff --git a/storage_controller/src/auth.rs b/storage_controller/src/auth.rs new file mode 100644 index 0000000000..ef47abf8c7 --- /dev/null +++ 
b/storage_controller/src/auth.rs @@ -0,0 +1,9 @@ +use utils::auth::{AuthError, Claims, Scope}; + +pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> { + if claims.scope != required_scope { + return Err(AuthError("Scope mismatch. Permission denied".into())); + } + + Ok(()) +} diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs new file mode 100644 index 0000000000..74b7e7c849 --- /dev/null +++ b/storage_controller/src/background_node_operations.rs @@ -0,0 +1,59 @@ +use std::{borrow::Cow, fmt::Debug, fmt::Display}; + +use tokio_util::sync::CancellationToken; +use utils::id::NodeId; + +pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10; + +#[derive(Copy, Clone)] +pub(crate) struct Drain { + pub(crate) node_id: NodeId, +} + +#[derive(Copy, Clone)] +pub(crate) struct Fill { + pub(crate) node_id: NodeId, +} + +#[derive(Copy, Clone)] +pub(crate) enum Operation { + Drain(Drain), + Fill(Fill), +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum OperationError { + #[error("Node state changed during operation: {0}")] + NodeStateChanged(Cow<'static, str>), + #[error("Operation finalize error: {0}")] + FinalizeError(Cow<'static, str>), + #[error("Operation cancelled")] + Cancelled, +} + +pub(crate) struct OperationHandler { + pub(crate) operation: Operation, + #[allow(unused)] + pub(crate) cancel: CancellationToken, +} + +impl Display for Drain { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "drain {}", self.node_id) + } +} + +impl Display for Fill { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "fill {}", self.node_id) + } +} + +impl Display for Operation { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Operation::Drain(op) => write!(f, "{op}"), + Operation::Fill(op) => write!(f, "{op}"), + } + } +} diff --git a/storage_controller/src/compute_hook.rs 
b/storage_controller/src/compute_hook.rs new file mode 100644 index 0000000000..a1d051f150 --- /dev/null +++ b/storage_controller/src/compute_hook.rs @@ -0,0 +1,693 @@ +use std::sync::Arc; +use std::{collections::HashMap, time::Duration}; + +use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; +use control_plane::local_env::LocalEnv; +use futures::StreamExt; +use hyper::StatusCode; +use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; +use postgres_connection::parse_host_port; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::{info_span, Instrument}; +use utils::{ + backoff::{self}, + id::{NodeId, TenantId}, +}; + +use crate::service::Config; + +const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); + +const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + +pub(crate) const API_CONCURRENCY: usize = 32; + +struct UnshardedComputeHookTenant { + // Which node is this tenant attached to + node_id: NodeId, + + // Must hold this lock to send a notification. + send_lock: Arc>>, +} +struct ShardedComputeHookTenant { + stripe_size: ShardStripeSize, + shard_count: ShardCount, + shards: Vec<(ShardNumber, NodeId)>, + + // Must hold this lock to send a notification. The contents represent + // the last successfully sent notification, and are used to coalesce multiple + // updates by only sending when there is a chance since our last successful send. 
+ send_lock: Arc>>, +} + +enum ComputeHookTenant { + Unsharded(UnshardedComputeHookTenant), + Sharded(ShardedComputeHookTenant), +} + +impl ComputeHookTenant { + /// Construct with at least one shard's information + fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self { + if tenant_shard_id.shard_count.count() > 1 { + Self::Sharded(ShardedComputeHookTenant { + shards: vec![(tenant_shard_id.shard_number, node_id)], + stripe_size, + shard_count: tenant_shard_id.shard_count, + send_lock: Arc::default(), + }) + } else { + Self::Unsharded(UnshardedComputeHookTenant { + node_id, + send_lock: Arc::default(), + }) + } + } + + fn get_send_lock(&self) -> &Arc>> { + match self { + Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, + Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, + } + } + + /// Set one shard's location. If stripe size or shard count have changed, Self is reset + /// and drops existing content. + fn update( + &mut self, + tenant_shard_id: TenantShardId, + stripe_size: ShardStripeSize, + node_id: NodeId, + ) { + match self { + Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { + unsharded_tenant.node_id = node_id + } + Self::Sharded(sharded_tenant) + if sharded_tenant.stripe_size == stripe_size + && sharded_tenant.shard_count == tenant_shard_id.shard_count => + { + if let Some(existing) = sharded_tenant + .shards + .iter() + .position(|s| s.0 == tenant_shard_id.shard_number) + { + sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id; + } else { + sharded_tenant + .shards + .push((tenant_shard_id.shard_number, node_id)); + sharded_tenant.shards.sort_by_key(|s| s.0) + } + } + _ => { + // Shard count changed: reset struct. 
+ *self = Self::new(tenant_shard_id, stripe_size, node_id); + } + } + } +} + +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] +struct ComputeHookNotifyRequestShard { + node_id: NodeId, + shard_number: ShardNumber, +} + +/// Request body that we send to the control plane to notify it of where a tenant is attached +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] +struct ComputeHookNotifyRequest { + tenant_id: TenantId, + stripe_size: Option, + shards: Vec, +} + +/// Error type for attempts to call into the control plane compute notification hook +#[derive(thiserror::Error, Debug)] +pub(crate) enum NotifyError { + // Request was not send successfully, e.g. transport error + #[error("Sending request: {0}")] + Request(#[from] reqwest::Error), + // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon. + #[error("Control plane tenant busy")] + Busy, + // Explicit 429 response asking us to retry less frequently + #[error("Control plane overloaded")] + SlowDown, + // A 503 response indicates the control plane can't handle the request right now + #[error("Control plane unavailable (status {0})")] + Unavailable(StatusCode), + // API returned unexpected non-success status. We will retry, but log a warning. + #[error("Control plane returned unexpected status {0}")] + Unexpected(StatusCode), + // We shutdown while sending + #[error("Shutting down")] + ShuttingDown, + // A response indicates we will never succeed, such as 400 or 404 + #[error("Non-retryable error {0}")] + Fatal(StatusCode), +} + +enum MaybeSendResult { + // Please send this request while holding the lock, and if you succeed then write + // the request into the lock. 
+ Transmit( + ( + ComputeHookNotifyRequest, + tokio::sync::OwnedMutexGuard>, + ), + ), + // Something requires sending, but you must wait for a current sender then call again + AwaitLock(Arc>>), + // Nothing requires sending + Noop, +} + +impl ComputeHookTenant { + fn maybe_send( + &self, + tenant_id: TenantId, + lock: Option>>, + ) -> MaybeSendResult { + let locked = match lock { + Some(already_locked) => already_locked, + None => { + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock. + let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else { + return MaybeSendResult::AwaitLock(self.get_send_lock().clone()); + }; + locked + } + }; + + let request = match self { + Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest { + tenant_id, + shards: vec![ComputeHookNotifyRequestShard { + shard_number: ShardNumber(0), + node_id: unsharded_tenant.node_id, + }], + stripe_size: None, + }), + Self::Sharded(sharded_tenant) + if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => + { + Some(ComputeHookNotifyRequest { + tenant_id, + shards: sharded_tenant + .shards + .iter() + .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { + shard_number: *shard_number, + node_id: *node_id, + }) + .collect(), + stripe_size: Some(sharded_tenant.stripe_size), + }) + } + Self::Sharded(sharded_tenant) => { + // Sharded tenant doesn't yet have information for all its shards + + tracing::info!( + "ComputeHookTenant::maybe_send: not enough shards ({}/{})", + sharded_tenant.shards.len(), + sharded_tenant.shard_count.count() + ); + None + } + }; + + match request { + None => { + // Not yet ready to emit a notification + tracing::info!("Tenant isn't yet ready to emit a notification"); + MaybeSendResult::Noop + } + Some(request) if Some(&request) == locked.as_ref() => { + // No change from the last value successfully sent + MaybeSendResult::Noop + } + Some(request) => 
MaybeSendResult::Transmit((request, locked)), + } + } +} + +/// The compute hook is a destination for notifications about changes to tenant:pageserver +/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures +/// the compute connection string. +pub(super) struct ComputeHook { + config: Config, + state: std::sync::Mutex>, + authorization_header: Option, + + // Concurrency limiter, so that we do not overload the cloud control plane when updating + // large numbers of tenants (e.g. when failing over after a node failure) + api_concurrency: tokio::sync::Semaphore, + + // This lock is only used in testing enviroments, to serialize calls into neon_lock + neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, +} + +impl ComputeHook { + pub(super) fn new(config: Config) -> Self { + let authorization_header = config + .control_plane_jwt_token + .clone() + .map(|jwt| format!("Bearer {}", jwt)); + + let client = reqwest::ClientBuilder::new() + .timeout(NOTIFY_REQUEST_TIMEOUT) + .build() + .expect("Failed to construct HTTP client"); + + Self { + state: Default::default(), + config, + authorization_header, + neon_local_lock: Default::default(), + api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, + } + } + + /// For test environments: use neon_local's LocalEnv to update compute + async fn do_notify_local( + &self, + reconfigure_request: &ComputeHookNotifyRequest, + ) -> anyhow::Result<()> { + // neon_local updates are not safe to call concurrently, use a lock to serialize + // all calls to this function + let _locked = self.neon_local_lock.lock().await; + + let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else { + tracing::warn!( + "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update" + ); + return Ok(()); + }; + let env = match 
LocalEnv::load_config(repo_dir) { + Ok(e) => e, + Err(e) => { + tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); + return Ok(()); + } + }; + let cplane = + ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); + let ComputeHookNotifyRequest { + tenant_id, + shards, + stripe_size, + } = reconfigure_request; + + let compute_pageservers = shards + .iter() + .map(|shard| { + let ps_conf = env + .get_pageserver_conf(shard.node_id) + .expect("Unknown pageserver"); + let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr) + .expect("Unable to parse listen_pg_addr"); + (pg_host, pg_port.unwrap_or(5432)) + }) + .collect::>(); + + for (endpoint_name, endpoint) in &cplane.endpoints { + if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { + tracing::info!("Reconfiguring endpoint {}", endpoint_name,); + endpoint + .reconfigure(compute_pageservers.clone(), *stripe_size) + .await?; + } + } + + Ok(()) + } + + async fn do_notify_iteration( + &self, + url: &String, + reconfigure_request: &ComputeHookNotifyRequest, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let req = self.client.request(reqwest::Method::PUT, url); + let req = if let Some(value) = &self.authorization_header { + req.header(reqwest::header::AUTHORIZATION, value) + } else { + req + }; + + tracing::info!( + "Sending notify request to {} ({:?})", + url, + reconfigure_request + ); + let send_result = req.json(&reconfigure_request).send().await; + let response = match send_result { + Ok(r) => r, + Err(e) => return Err(e.into()), + }; + + // Treat all 2xx responses as success + if response.status() >= reqwest::StatusCode::OK + && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES + { + if response.status() != reqwest::StatusCode::OK { + // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so + // log a warning. 
+ tracing::warn!( + "Unexpected 2xx response code {} from control plane", + response.status() + ); + } + + return Ok(()); + } + + // Error response codes + match response.status() { + reqwest::StatusCode::TOO_MANY_REQUESTS => { + // TODO: 429 handling should be global: set some state visible to other requests + // so that they will delay before starting, rather than all notifications trying + // once before backing off. + tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled()) + .await + .ok(); + Err(NotifyError::SlowDown) + } + reqwest::StatusCode::LOCKED => { + // We consider this fatal, because it's possible that the operation blocking the control one is + // also the one that is waiting for this reconcile. We should let the reconciler calling + // this hook fail, to give control plane a chance to un-lock. + tracing::info!("Control plane reports tenant is locked, dropping out of notify"); + Err(NotifyError::Busy) + } + reqwest::StatusCode::SERVICE_UNAVAILABLE => { + Err(NotifyError::Unavailable(StatusCode::SERVICE_UNAVAILABLE)) + } + reqwest::StatusCode::GATEWAY_TIMEOUT => { + Err(NotifyError::Unavailable(StatusCode::GATEWAY_TIMEOUT)) + } + reqwest::StatusCode::BAD_GATEWAY => { + Err(NotifyError::Unavailable(StatusCode::BAD_GATEWAY)) + } + + reqwest::StatusCode::BAD_REQUEST => Err(NotifyError::Fatal(StatusCode::BAD_REQUEST)), + reqwest::StatusCode::UNAUTHORIZED => Err(NotifyError::Fatal(StatusCode::UNAUTHORIZED)), + reqwest::StatusCode::FORBIDDEN => Err(NotifyError::Fatal(StatusCode::FORBIDDEN)), + status => Err(NotifyError::Unexpected( + hyper::StatusCode::from_u16(status.as_u16()) + .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR), + )), + } + } + + async fn do_notify( + &self, + url: &String, + reconfigure_request: &ComputeHookNotifyRequest, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + // We hold these semaphore units across all retries, rather than only across each + // HTTP request: this is to preserve fairness and avoid a situation where 
a retry might + // time out waiting for a semaphore. + let _units = self + .api_concurrency + .acquire() + .await + // Interpret closed semaphore as shutdown + .map_err(|_| NotifyError::ShuttingDown)?; + + backoff::retry( + || self.do_notify_iteration(url, reconfigure_request, cancel), + |e| { + matches!( + e, + NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy + ) + }, + 3, + 10, + "Send compute notification", + cancel, + ) + .await + .ok_or_else(|| NotifyError::ShuttingDown) + .and_then(|x| x) + } + + /// Synchronous phase: update the per-tenant state for the next intended notification + fn notify_prepare( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + stripe_size: ShardStripeSize, + ) -> MaybeSendResult { + let mut state_locked = self.state.lock().unwrap(); + + use std::collections::hash_map::Entry; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } + }; + tenant.maybe_send(tenant_shard_id.tenant_id, None) + } + + async fn notify_execute( + &self, + maybe_send_result: MaybeSendResult, + tenant_shard_id: TenantShardId, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + // Process result: we may get an update to send, or we may have to wait for a lock + // before trying again. + let (request, mut send_lock_guard) = match maybe_send_result { + MaybeSendResult::Noop => { + return Ok(()); + } + MaybeSendResult::AwaitLock(send_lock) => { + let send_locked = tokio::select! { + guard = send_lock.lock_owned() => {guard}, + _ = cancel.cancelled() => { + return Err(NotifyError::ShuttingDown) + } + }; + + // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here + // we have acquired the send lock and take `[Self::state]` lock. 
This is safe because maybe_send only uses + // try_lock. + let state_locked = self.state.lock().unwrap(); + let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else { + return Ok(()); + }; + match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) { + MaybeSendResult::AwaitLock(_) => { + unreachable!("We supplied lock guard") + } + MaybeSendResult::Noop => { + return Ok(()); + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), + } + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), + }; + + let result = if let Some(notify_url) = &self.config.compute_hook_url { + self.do_notify(notify_url, &request, cancel).await + } else { + self.do_notify_local(&request).await.map_err(|e| { + // This path is for testing only, so munge the error into our prod-style error type. + tracing::error!("Local notification hook failed: {e}"); + NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) + }) + }; + + if result.is_ok() { + // Before dropping the send lock, stash the request we just sent so that + // subsequent callers can avoid redundantly re-sending the same thing. + *send_lock_guard = Some(request); + } + result + } + + /// Infallible synchronous fire-and-forget version of notify(), that sends its results to + /// a channel. Something should consume the channel and arrange to try notifying again + /// if something failed. 
+ pub(super) fn notify_background( + self: &Arc, + notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>, + result_tx: tokio::sync::mpsc::Sender>, + cancel: &CancellationToken, + ) { + let mut maybe_sends = Vec::new(); + for (tenant_shard_id, node_id, stripe_size) in notifications { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + maybe_sends.push((tenant_shard_id, maybe_send_result)) + } + + let this = self.clone(); + let cancel = cancel.clone(); + + tokio::task::spawn(async move { + // Construct an async stream of futures to invoke the compute notify function: we do this + // in order to subsequently use .buffered() on the stream to execute with bounded parallelism. The + // ComputeHook semaphore already limits concurrency, but this way we avoid constructing+polling lots of futures which + // would mostly just be waiting on that semaphore. + let mut stream = futures::stream::iter(maybe_sends) + .map(|(tenant_shard_id, maybe_send_result)| { + let this = this.clone(); + let cancel = cancel.clone(); + + async move { + this + .notify_execute(maybe_send_result, tenant_shard_id, &cancel) + .await.map_err(|e| (tenant_shard_id, e)) + }.instrument(info_span!( + "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() + )) + }) + .buffered(API_CONCURRENCY); + + loop { + tokio::select! { + next = stream.next() => { + match next { + Some(r) => { + result_tx.send(r).await.ok(); + }, + None => { + tracing::info!("Finished sending background compute notifications"); + break; + } + } + }, + _ = cancel.cancelled() => { + tracing::info!("Shutdown while running background compute notifications"); + break; + } + }; + } + }); + } + + /// Call this to notify the compute (postgres) tier of new pageservers to use + /// for a tenant. notify() is called by each shard individually, and this function + /// will decide whether an update to the tenant is sent. 
An update is sent on the + /// condition that: + /// - We know a pageserver for every shard. + /// - All the shards have the same shard_count (i.e. we are not mid-split) + /// + /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler + /// that is cancelled. + /// + /// This function is fallible, including in the case that the control plane is transiently + /// unavailable. A limited number of retries are done internally to efficiently hide short unavailability + /// periods, but we don't retry forever. The **caller** is responsible for handling failures and + /// ensuring that they eventually call again to ensure that the compute is eventually notified of + /// the proper pageserver nodes for a tenant. + #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))] + pub(super) async fn notify( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + stripe_size: ShardStripeSize, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size); + self.notify_execute(maybe_send_result, tenant_shard_id, cancel) + .await + } +} + +#[cfg(test)] +pub(crate) mod tests { + use pageserver_api::shard::{ShardCount, ShardNumber}; + use utils::id::TenantId; + + use super::*; + + #[test] + fn tenant_updates() -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let mut tenant_state = ComputeHookTenant::new( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(0), + shard_number: ShardNumber(0), + }, + ShardStripeSize(12345), + NodeId(1), + ); + + // An unsharded tenant is always ready to emit a notification, but won't + // send the same one twice + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 1); + 
assert!(request.stripe_size.is_none()); + + // Simulate successful send + *guard = Some(request); + drop(guard); + + // Try asking again: this should be a no-op + let send_result = tenant_state.maybe_send(tenant_id, None); + assert!(matches!(send_result, MaybeSendResult::Noop)); + + // Writing the first shard of a multi-sharded situation (i.e. in a split) + // resets the tenant state and puts it in an non-notifying state (need to + // see all shards) + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(1), + }, + ShardStripeSize(32768), + NodeId(1), + ); + assert!(matches!( + tenant_state.maybe_send(tenant_id, None), + MaybeSendResult::Noop + )); + + // Writing the second shard makes it ready to notify + tenant_state.update( + TenantShardId { + tenant_id, + shard_count: ShardCount::new(2), + shard_number: ShardNumber(0), + }, + ShardStripeSize(32768), + NodeId(1), + ); + + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 2); + assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); + + // Simulate successful send + *guard = Some(request); + drop(guard); + + Ok(()) + } +} diff --git a/storage_controller/src/heartbeater.rs b/storage_controller/src/heartbeater.rs new file mode 100644 index 0000000000..14cda0a289 --- /dev/null +++ b/storage_controller/src/heartbeater.rs @@ -0,0 +1,244 @@ +use futures::{stream::FuturesUnordered, StreamExt}; +use std::{ + collections::HashMap, + sync::Arc, + time::{Duration, Instant}, +}; +use tokio_util::sync::CancellationToken; + +use pageserver_api::{ + controller_api::{NodeAvailability, UtilizationScore}, + models::PageserverUtilization, +}; + +use thiserror::Error; +use utils::id::NodeId; + +use crate::node::Node; + +struct HeartbeaterTask { + receiver: tokio::sync::mpsc::UnboundedReceiver, + cancel: 
CancellationToken, + + state: HashMap, + + max_unavailable_interval: Duration, + jwt_token: Option, +} + +#[derive(Debug, Clone)] +pub(crate) enum PageserverState { + Available { + last_seen_at: Instant, + utilization: PageserverUtilization, + new: bool, + }, + Offline, +} + +#[derive(Debug)] +pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>); + +#[derive(Debug, Error)] +pub(crate) enum HeartbeaterError { + #[error("Cancelled")] + Cancel, +} + +struct HeartbeatRequest { + pageservers: Arc>, + reply: tokio::sync::oneshot::Sender>, +} + +pub(crate) struct Heartbeater { + sender: tokio::sync::mpsc::UnboundedSender, +} + +impl Heartbeater { + pub(crate) fn new( + jwt_token: Option, + max_unavailable_interval: Duration, + cancel: CancellationToken, + ) -> Self { + let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::(); + let mut heartbeater = + HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel); + tokio::task::spawn(async move { heartbeater.run().await }); + + Self { sender } + } + + pub(crate) async fn heartbeat( + &self, + pageservers: Arc>, + ) -> Result { + let (sender, receiver) = tokio::sync::oneshot::channel(); + self.sender + .send(HeartbeatRequest { + pageservers, + reply: sender, + }) + .unwrap(); + + receiver.await.unwrap() + } +} + +impl HeartbeaterTask { + fn new( + receiver: tokio::sync::mpsc::UnboundedReceiver, + jwt_token: Option, + max_unavailable_interval: Duration, + cancel: CancellationToken, + ) -> Self { + Self { + receiver, + cancel, + state: HashMap::new(), + max_unavailable_interval, + jwt_token, + } + } + + async fn run(&mut self) { + loop { + tokio::select! 
{ + request = self.receiver.recv() => { + match request { + Some(req) => { + let res = self.heartbeat(req.pageservers).await; + req.reply.send(res).unwrap(); + }, + None => { return; } + } + }, + _ = self.cancel.cancelled() => return + } + } + } + + async fn heartbeat( + &mut self, + pageservers: Arc>, + ) -> Result { + let mut new_state = HashMap::new(); + + let mut heartbeat_futs = FuturesUnordered::new(); + for (node_id, node) in &*pageservers { + heartbeat_futs.push({ + let jwt_token = self.jwt_token.clone(); + let cancel = self.cancel.clone(); + let new_node = !self.state.contains_key(node_id); + + // Clone the node and mark it as available such that the request + // goes through to the pageserver even when the node is marked offline. + // This doesn't impact the availability observed by [`crate::service::Service`]. + let mut node = node.clone(); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + + async move { + let response = node + .with_client_retries( + |client| async move { client.get_utilization().await }, + &jwt_token, + 3, + 3, + Duration::from_secs(1), + &cancel, + ) + .await; + + let response = match response { + Some(r) => r, + None => { + // This indicates cancellation of the request. + // We ignore the node in this case. + return None; + } + }; + + let status = if let Ok(utilization) = response { + PageserverState::Available { + last_seen_at: Instant::now(), + utilization, + new: new_node, + } + } else { + PageserverState::Offline + }; + + Some((*node_id, status)) + } + }); + + loop { + let maybe_status = tokio::select! 
{ + next = heartbeat_futs.next() => { + match next { + Some(result) => result, + None => { break; } + } + }, + _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); } + }; + + if let Some((node_id, status)) = maybe_status { + new_state.insert(node_id, status); + } + } + } + tracing::info!( + "Heartbeat round complete for {} nodes, {} offline", + new_state.len(), + new_state + .values() + .filter(|s| match s { + PageserverState::Available { .. } => { + false + } + PageserverState::Offline => true, + }) + .count() + ); + + let mut deltas = Vec::new(); + let now = Instant::now(); + for (node_id, ps_state) in new_state { + use std::collections::hash_map::Entry::*; + let entry = self.state.entry(node_id); + + let mut needs_update = false; + match entry { + Occupied(ref occ) => match (occ.get(), &ps_state) { + (PageserverState::Offline, PageserverState::Offline) => {} + (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => { + if now - *last_seen_at >= self.max_unavailable_interval { + deltas.push((node_id, ps_state.clone())); + needs_update = true; + } + } + _ => { + deltas.push((node_id, ps_state.clone())); + needs_update = true; + } + }, + Vacant(_) => { + // This is a new node. Don't generate a delta for it. 
+ deltas.push((node_id, ps_state.clone())); + } + } + + match entry { + Occupied(mut occ) if needs_update => { + (*occ.get_mut()) = ps_state; + } + Vacant(vac) => { + vac.insert(ps_state); + } + _ => {} + } + } + + Ok(AvailablityDeltas(deltas)) + } +} diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs new file mode 100644 index 0000000000..3e9951fb9e --- /dev/null +++ b/storage_controller/src/http.rs @@ -0,0 +1,979 @@ +use crate::metrics::{ + HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, + METRICS_REGISTRY, +}; +use crate::reconciler::ReconcileError; +use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT}; +use anyhow::Context; +use futures::Future; +use hyper::header::CONTENT_TYPE; +use hyper::{Body, Request, Response}; +use hyper::{StatusCode, Uri}; +use metrics::{BuildInfo, NeonMetrics}; +use pageserver_api::models::{ + TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, + TenantTimeTravelRequest, TimelineCreateRequest, +}; +use pageserver_api::shard::TenantShardId; +use pageserver_client::mgmt_api; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use tokio_util::sync::CancellationToken; +use utils::auth::{Scope, SwappableJwtAuth}; +use utils::failpoint_support::failpoints_handler; +use utils::http::endpoint::{auth_middleware, check_permission_with, request_span}; +use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param}; +use utils::id::{TenantId, TimelineId}; + +use utils::{ + http::{ + endpoint::{self}, + error::ApiError, + json::{json_request, json_response}, + RequestExt, RouterBuilder, + }, + id::NodeId, +}; + +use pageserver_api::controller_api::{ + NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest, + TenantShardMigrateRequest, +}; +use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest}; + +use control_plane::storage_controller::{AttachHookRequest, 
InspectRequest}; + +use routerify::Middleware; + +/// State available to HTTP request handlers +pub struct HttpState { + service: Arc, + auth: Option>, + neon_metrics: NeonMetrics, + allowlist_routes: Vec, +} + +impl HttpState { + pub fn new( + service: Arc, + auth: Option>, + build_info: BuildInfo, + ) -> Self { + let allowlist_routes = ["/status", "/ready", "/metrics"] + .iter() + .map(|v| v.parse().unwrap()) + .collect::>(); + Self { + service, + auth, + neon_metrics: NeonMetrics::new(build_info), + allowlist_routes, + } + } +} + +#[inline(always)] +fn get_state(request: &Request) -> &HttpState { + request + .data::>() + .expect("unknown state type") + .as_ref() +} + +/// Pageserver calls into this on startup, to learn which tenants it should attach +async fn handle_re_attach(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + + let reattach_req = json_request::(&mut req).await?; + let state = get_state(&req); + json_response(StatusCode::OK, state.service.re_attach(reattach_req).await?) +} + +/// Pageserver calls into this before doing deletions, to confirm that it still +/// holds the latest generation for the tenants with deletions enqueued +async fn handle_validate(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + + let validate_req = json_request::(&mut req).await?; + let state = get_state(&req); + json_response(StatusCode::OK, state.service.validate(validate_req)) +} + +/// Call into this before attaching a tenant to a pageserver, to acquire a generation number +/// (in the real control plane this is unnecessary, because the same program is managing +/// generation numbers and doing attachments). 
+async fn handle_attach_hook(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let attach_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + state + .service + .attach_hook(attach_req) + .await + .map_err(ApiError::InternalServerError)?, + ) +} + +async fn handle_inspect(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let inspect_req = json_request::(&mut req).await?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.inspect(inspect_req)) +} + +async fn handle_tenant_create( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::PageServerApi)?; + + let create_req = json_request::(&mut req).await?; + + json_response( + StatusCode::CREATED, + service.tenant_create(create_req).await?, + ) +} + +async fn handle_tenant_location_config( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let config_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service + .tenant_location_config(tenant_shard_id, config_req) + .await?, + ) +} + +async fn handle_tenant_config_set( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::PageServerApi)?; + + let config_req = json_request::(&mut req).await?; + + json_response(StatusCode::OK, service.tenant_config_set(config_req).await?) +} + +async fn handle_tenant_config_get( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?) 
+} + +async fn handle_tenant_time_travel_remote_storage( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let time_travel_req = json_request::(&mut req).await?; + + let timestamp_raw = must_get_query_param(&req, "travel_to")?; + let _timestamp = humantime::parse_rfc3339(×tamp_raw).map_err(|_e| { + ApiError::BadRequest(anyhow::anyhow!( + "Invalid time for travel_to: {timestamp_raw:?}" + )) + })?; + + let done_if_after_raw = must_get_query_param(&req, "done_if_after")?; + let _done_if_after = humantime::parse_rfc3339(&done_if_after_raw).map_err(|_e| { + ApiError::BadRequest(anyhow::anyhow!( + "Invalid time for done_if_after: {done_if_after_raw:?}" + )) + })?; + + service + .tenant_time_travel_remote_storage( + &time_travel_req, + tenant_id, + timestamp_raw, + done_if_after_raw, + ) + .await?; + json_response(StatusCode::OK, ()) +} + +fn map_reqwest_hyper_status(status: reqwest::StatusCode) -> Result { + hyper::StatusCode::from_u16(status.as_u16()) + .context("invalid status code") + .map_err(ApiError::InternalServerError) +} + +async fn handle_tenant_secondary_download( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis); + + let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?; + json_response(map_reqwest_hyper_status(status)?, progress) +} + +async fn handle_tenant_delete( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let status_code = service + .tenant_delete(tenant_id) + .await + .and_then(map_reqwest_hyper_status)?; + + if status_code == StatusCode::NOT_FOUND { + // The pageserver uses 404 for successful deletion, but 
we use 200 + json_response(StatusCode::OK, ()) + } else { + json_response(status_code, ()) + } +} + +async fn handle_tenant_timeline_create( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let create_req = json_request::(&mut req).await?; + json_response( + StatusCode::CREATED, + service + .tenant_timeline_create(tenant_id, create_req) + .await?, + ) +} + +async fn handle_tenant_timeline_delete( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + // For timeline deletions, which both implement an "initially return 202, then 404 once + // we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. + async fn deletion_wrapper(service: Arc, f: F) -> Result, ApiError> + where + R: std::future::Future> + Send + 'static, + F: Fn(Arc) -> R + Send + Sync + 'static, + { + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion + // completed. + let mut retry_period = Duration::from_secs(1); + // On subsequent retries, wait longer. 
+ let max_retry_period = Duration::from_secs(5); + // Enable callers with a 30 second request timeout to reliably get a response + let max_wait = Duration::from_secs(25); + + loop { + let status = f(service.clone()).await?; + match status { + StatusCode::ACCEPTED => { + tracing::info!("Deletion accepted, waiting to try again..."); + tokio::time::sleep(retry_period).await; + retry_period = max_retry_period; + } + StatusCode::NOT_FOUND => { + tracing::info!("Deletion complete"); + return json_response(StatusCode::OK, ()); + } + _ => { + tracing::warn!("Unexpected status {status}"); + return json_response(status, ()); + } + } + + let now = Instant::now(); + if now + retry_period > started_at + max_wait { + tracing::info!("Deletion timed out waiting for 404"); + // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of + // the pageserver's swagger definition for this endpoint, and has the same desired + // effect of causing the control plane to retry later. + return json_response(StatusCode::CONFLICT, ()); + } + } + } + + deletion_wrapper(service, move |service| async move { + service + .tenant_timeline_delete(tenant_id, timeline_id) + .await + .and_then(map_reqwest_hyper_status) + }) + .await +} + +async fn handle_tenant_timeline_passthrough( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let Some(path) = req.uri().path_and_query() else { + // This should never happen, our request router only calls us if there is a path + return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path"))); + }; + + tracing::info!("Proxying request for tenant {} ({})", tenant_id, path); + + // Find the node that holds shard zero + let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?; + + // Callers will always pass an unsharded tenant ID. Before proxying, we must + // rewrite this to a shard-aware shard zero ID. 
+ let path = format!("{}", path); + let tenant_str = tenant_id.to_string(); + let tenant_shard_str = format!("{}", tenant_shard_id); + let path = path.replace(&tenant_str, &tenant_shard_str); + + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_latency; + + // This is a bit awkward. We remove the param from the request + // and join the words by '_' to get a label for the request. + let just_path = path.replace(&tenant_shard_str, ""); + let path_label = just_path + .split('/') + .filter(|token| !token.is_empty()) + .collect::>() + .join("_"); + let labels = PageserverRequestLabelGroup { + pageserver_id: &node.get_id().to_string(), + path: &path_label, + method: crate::metrics::Method::Get, + }; + + let _timer = latency.start_timer(labels.clone()); + + let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); + let resp = client.get_raw(path).await.map_err(|_e| + // FIXME: give APiError a proper Unavailable variant. We return 503 here because + // if we can't successfully send a request to the pageserver, we aren't available. 
+ ApiError::ShuttingDown)?; + + if !resp.status().is_success() { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_passthrough_request_error; + error_counter.inc(labels); + } + + // We have a reqest::Response, would like a http::Response + let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?); + for (k, v) in resp.headers() { + builder = builder.header(k.as_str(), v.as_bytes()); + } + + let response = builder + .body(Body::wrap_stream(resp.bytes_stream())) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + Ok(response) +} + +async fn handle_tenant_locate( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + json_response(StatusCode::OK, service.tenant_locate(tenant_id)?) +} + +async fn handle_tenant_describe( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) 
+} + +async fn handle_tenant_list( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + json_response(StatusCode::OK, service.tenant_list()) +} + +async fn handle_node_register(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let register_req = json_request::(&mut req).await?; + let state = get_state(&req); + state.service.node_register(register_req).await?; + json_response(StatusCode::OK, ()) +} + +async fn handle_node_list(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let nodes = state.service.node_list().await?; + let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); + + json_response(StatusCode::OK, api_nodes) +} + +async fn handle_node_drop(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + json_response(StatusCode::OK, state.service.node_drop(node_id).await?) 
+} + +async fn handle_node_configure(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let node_id: NodeId = parse_request_param(&req, "node_id")?; + let config_req = json_request::(&mut req).await?; + if node_id != config_req.node_id { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Path and body node_id differ" + ))); + } + let state = get_state(&req); + + json_response( + StatusCode::OK, + state + .service + .node_configure( + config_req.node_id, + config_req.availability.map(NodeAvailability::from), + config_req.scheduling, + ) + .await?, + ) +} + +async fn handle_node_status(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + let node_status = state.service.get_node(node_id).await?; + + json_response(StatusCode::OK, node_status) +} + +async fn handle_node_drain(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + state.service.start_node_drain(node_id).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn handle_node_fill(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + let node_id: NodeId = parse_request_param(&req, "node_id")?; + + state.service.start_node_fill(node_id).await?; + + json_response(StatusCode::ACCEPTED, ()) +} + +async fn handle_tenant_shard_split( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let split_req = json_request::(&mut req).await?; + + json_response( + StatusCode::OK, + service.tenant_shard_split(tenant_id, split_req).await?, + ) +} + +async fn handle_tenant_shard_migrate( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + 
check_permissions(&req, Scope::Admin)?; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + let migrate_req = json_request::(&mut req).await?; + json_response( + StatusCode::OK, + service + .tenant_shard_migrate(tenant_shard_id, migrate_req) + .await?, + ) +} + +async fn handle_tenant_update_policy(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let update_req = json_request::(&mut req).await?; + let state = get_state(&req); + + json_response( + StatusCode::OK, + state + .service + .tenant_update_policy(tenant_id, update_req) + .await?, + ) +} + +async fn handle_tenant_drop(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?) +} + +async fn handle_tenant_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let state = get_state(&req); + + json_response( + StatusCode::OK, + state.service.tenant_import(tenant_id).await?, + ) +} + +async fn handle_tenants_dump(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + state.service.tenants_dump() +} + +async fn handle_scheduler_dump(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + state.service.scheduler_dump() +} + +async fn handle_consistency_check(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.consistency_check().await?) 
+} + +async fn handle_reconcile_all(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let state = get_state(&req); + + json_response(StatusCode::OK, state.service.reconcile_all_now().await?) +} + +/// Status endpoint is just used for checking that our HTTP listener is up +async fn handle_status(_req: Request) -> Result, ApiError> { + json_response(StatusCode::OK, ()) +} + +/// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling +/// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe. +async fn handle_ready(req: Request) -> Result, ApiError> { + let state = get_state(&req); + if state.service.startup_complete.is_ready() { + json_response(StatusCode::OK, ()) + } else { + json_response(StatusCode::SERVICE_UNAVAILABLE, ()) + } +} + +impl From for ApiError { + fn from(value: ReconcileError) -> Self { + ApiError::Conflict(format!("Reconciliation error: {}", value)) + } +} + +/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only +/// be allowed to run if Service has finished its initial reconciliation. +async fn tenant_service_handler( + request: Request, + handler: H, + request_name: RequestName, +) -> R::Output +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Arc, Request) -> R + Send + Sync + 'static, +{ + let state = get_state(&request); + let service = state.service.clone(); + + let startup_complete = service.startup_complete.clone(); + if tokio::time::timeout(STARTUP_RECONCILE_TIMEOUT, startup_complete.wait()) + .await + .is_err() + { + // This shouldn't happen: it is the responsibilty of [`Service::startup_reconcile`] to use appropriate + // timeouts around its remote calls, to bound its runtime. 
+ return Err(ApiError::Timeout( + "Timed out waiting for service readiness".into(), + )); + } + + named_request_span( + request, + |request| async move { handler(service, request).await }, + request_name, + ) + .await +} + +/// Check if the required scope is held in the request's token, or if the request has +/// a token with 'admin' scope then always permit it. +fn check_permissions(request: &Request, required_scope: Scope) -> Result<(), ApiError> { + check_permission_with(request, |claims| { + match crate::auth::check_permission(claims, required_scope) { + Err(e) => match crate::auth::check_permission(claims, Scope::Admin) { + Ok(()) => Ok(()), + Err(_) => Err(e), + }, + Ok(()) => Ok(()), + } + }) +} + +#[derive(Clone, Debug)] +struct RequestMeta { + method: hyper::http::Method, + at: Instant, +} + +fn prologue_metrics_middleware( +) -> Middleware { + Middleware::pre(move |req| async move { + let meta = RequestMeta { + method: req.method().clone(), + at: Instant::now(), + }; + + req.set_context(meta); + + Ok(req) + }) +} + +fn epilogue_metrics_middleware( +) -> Middleware { + Middleware::post_with_info(move |resp, req_info| async move { + let request_name = match req_info.context::() { + Some(name) => name, + None => { + return Ok(resp); + } + }; + + if let Some(meta) = req_info.context::() { + let status = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_status; + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_http_request_latency; + + status.inc(HttpRequestStatusLabelGroup { + path: request_name.0, + method: meta.method.clone().into(), + status: crate::metrics::StatusCode(resp.status()), + }); + + latency.observe( + HttpRequestLatencyLabelGroup { + path: request_name.0, + method: meta.method.into(), + }, + meta.at.elapsed().as_secs_f64(), + ); + } + Ok(resp) + }) +} + +pub async fn measured_metrics_handler(req: Request) -> Result, ApiError> { + pub const TEXT_FORMAT: &str = 
"text/plain; version=0.0.4"; + + let state = get_state(&req); + let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics); + let response = Response::builder() + .status(200) + .header(CONTENT_TYPE, TEXT_FORMAT) + .body(payload.into()) + .unwrap(); + + Ok(response) +} + +#[derive(Clone)] +struct RequestName(&'static str); + +async fn named_request_span( + request: Request, + handler: H, + name: RequestName, +) -> R::Output +where + R: Future, ApiError>> + Send + 'static, + H: FnOnce(Request) -> R + Send + Sync + 'static, +{ + request.set_context(name); + request_span(request, handler).await +} + +pub fn make_router( + service: Arc, + auth: Option>, + build_info: BuildInfo, +) -> RouterBuilder { + let mut router = endpoint::make_router() + .middleware(prologue_metrics_middleware()) + .middleware(epilogue_metrics_middleware()); + if auth.is_some() { + router = router.middleware(auth_middleware(|request| { + let state = get_state(request); + if state.allowlist_routes.contains(request.uri()) { + None + } else { + state.auth.as_deref() + } + })); + } + + router + .data(Arc::new(HttpState::new(service, auth, build_info))) + .get("/metrics", |r| { + named_request_span(r, measured_metrics_handler, RequestName("metrics")) + }) + // Non-prefixed generic endpoints (status, metrics) + .get("/status", |r| { + named_request_span(r, handle_status, RequestName("status")) + }) + .get("/ready", |r| { + named_request_span(r, handle_ready, RequestName("ready")) + }) + // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix + .post("/upcall/v1/re-attach", |r| { + named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach")) + }) + .post("/upcall/v1/validate", |r| { + named_request_span(r, handle_validate, RequestName("upcall_v1_validate")) + }) + // Test/dev/debug endpoints + .post("/debug/v1/attach-hook", |r| { + named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook")) + }) + 
.post("/debug/v1/inspect", |r| { + named_request_span(r, handle_inspect, RequestName("debug_v1_inspect")) + }) + .post("/debug/v1/tenant/:tenant_id/drop", |r| { + named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop")) + }) + .post("/debug/v1/node/:node_id/drop", |r| { + named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop")) + }) + .post("/debug/v1/tenant/:tenant_id/import", |r| { + named_request_span( + r, + handle_tenant_import, + RequestName("debug_v1_tenant_import"), + ) + }) + .get("/debug/v1/tenant", |r| { + named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant")) + }) + .get("/debug/v1/tenant/:tenant_id/locate", |r| { + tenant_service_handler( + r, + handle_tenant_locate, + RequestName("debug_v1_tenant_locate"), + ) + }) + .get("/debug/v1/scheduler", |r| { + named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) + }) + .post("/debug/v1/consistency_check", |r| { + named_request_span( + r, + handle_consistency_check, + RequestName("debug_v1_consistency_check"), + ) + }) + .post("/debug/v1/reconcile_all", |r| { + request_span(r, handle_reconcile_all) + }) + .put("/debug/v1/failpoints", |r| { + request_span(r, |r| failpoints_handler(r, CancellationToken::new())) + }) + // Node operations + .post("/control/v1/node", |r| { + named_request_span(r, handle_node_register, RequestName("control_v1_node")) + }) + .get("/control/v1/node", |r| { + named_request_span(r, handle_node_list, RequestName("control_v1_node")) + }) + .put("/control/v1/node/:node_id/config", |r| { + named_request_span( + r, + handle_node_configure, + RequestName("control_v1_node_config"), + ) + }) + .get("/control/v1/node/:node_id", |r| { + named_request_span(r, handle_node_status, RequestName("control_v1_node_status")) + }) + .put("/control/v1/node/:node_id/drain", |r| { + named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain")) + }) + .put("/control/v1/node/:node_id/fill", |r| { + 
named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill")) + }) + // TODO(vlad): endpoint for cancelling drain and fill + // Tenant Shard operations + .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { + tenant_service_handler( + r, + handle_tenant_shard_migrate, + RequestName("control_v1_tenant_migrate"), + ) + }) + .put("/control/v1/tenant/:tenant_id/shard_split", |r| { + tenant_service_handler( + r, + handle_tenant_shard_split, + RequestName("control_v1_tenant_shard_split"), + ) + }) + .get("/control/v1/tenant/:tenant_id", |r| { + tenant_service_handler( + r, + handle_tenant_describe, + RequestName("control_v1_tenant_describe"), + ) + }) + .get("/control/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list")) + }) + .put("/control/v1/tenant/:tenant_id/policy", |r| { + named_request_span( + r, + handle_tenant_update_policy, + RequestName("control_v1_tenant_policy"), + ) + }) + // Tenant operations + // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into + // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. 
+ .post("/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant")) + }) + .delete("/v1/tenant/:tenant_id", |r| { + tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant")) + }) + .put("/v1/tenant/config", |r| { + tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config")) + }) + .get("/v1/tenant/:tenant_id/config", |r| { + tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config")) + }) + .put("/v1/tenant/:tenant_shard_id/location_config", |r| { + tenant_service_handler( + r, + handle_tenant_location_config, + RequestName("v1_tenant_location_config"), + ) + }) + .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| { + tenant_service_handler( + r, + handle_tenant_time_travel_remote_storage, + RequestName("v1_tenant_time_travel_remote_storage"), + ) + }) + .post("/v1/tenant/:tenant_id/secondary/download", |r| { + tenant_service_handler( + r, + handle_tenant_secondary_download, + RequestName("v1_tenant_secondary_download"), + ) + }) + // Timeline operations + .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_delete, + RequestName("v1_tenant_timeline"), + ) + }) + .post("/v1/tenant/:tenant_id/timeline", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_create, + RequestName("v1_tenant_timeline"), + ) + }) + // Tenant detail GET passthrough to shard zero: + .get("/v1/tenant/:tenant_id", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_passthrough"), + ) + }) + // The `*` in the URL is a wildcard: any tenant/timeline GET APIs on the pageserver + // are implicitly exposed here. This must be last in the list to avoid + // taking precedence over other GET methods we might implement by hand. 
+ .get("/v1/tenant/:tenant_id/*", |r| { + tenant_service_handler( + r, + handle_tenant_timeline_passthrough, + RequestName("v1_tenant_passthrough"), + ) + }) +} diff --git a/storage_controller/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs new file mode 100644 index 0000000000..dff793289f --- /dev/null +++ b/storage_controller/src/id_lock_map.rs @@ -0,0 +1,189 @@ +use std::fmt::Display; +use std::time::Instant; +use std::{collections::HashMap, sync::Arc}; + +use std::time::Duration; + +use crate::service::RECONCILE_TIMEOUT; + +const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT; + +/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the +/// current holding operation in lock. +pub struct WrappedWriteGuard { + guard: tokio::sync::OwnedRwLockWriteGuard>, + start: Instant, +} + +impl WrappedWriteGuard { + pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard>) -> Self { + Self { + guard, + start: Instant::now(), + } + } +} + +impl Drop for WrappedWriteGuard { + fn drop(&mut self) { + let duration = self.start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Lock on {} was held for {:?}", + self.guard.as_ref().unwrap(), + duration + ); + } + *self.guard = None; + } +} + +/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't +/// want to embed a lock in each one, or if your locking granularity is different to your object granularity. +/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking +/// is needed at a tenant-wide granularity. +pub(crate) struct IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + /// A synchronous lock for getting/setting the async locks that our callers will wait on. 
+ entities: std::sync::Mutex>>>>, +} + +impl IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, + I: Display, +{ + pub(crate) fn shared( + &self, + key: T, + ) -> impl std::future::Future>> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default(); + entry.clone().read_owned() + } + + pub(crate) fn exclusive( + &self, + key: T, + operation: I, + ) -> impl std::future::Future> { + let mut locked = self.entities.lock().unwrap(); + let entry = locked.entry(key).or_default().clone(); + async move { + let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await); + *guard.guard = Some(operation); + guard + } + } + + /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do + /// periodic housekeeping to avoid the map growing indefinitely + pub(crate) fn housekeeping(&self) { + let mut locked = self.entities.lock().unwrap(); + locked.retain(|_k, entry| entry.try_write().is_err()) + } +} + +impl Default for IdLockMap +where + T: Eq + PartialEq + std::hash::Hash, +{ + fn default() -> Self { + Self { + entities: std::sync::Mutex::new(HashMap::new()), + } + } +} + +pub async fn trace_exclusive_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display + Clone, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> WrappedWriteGuard { + let start = Instant::now(); + let guard = op_locks.exclusive(key.clone(), operation.clone()).await; + + let duration = start.elapsed(); + if duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for exclusive lock", + operation, + key, + duration + ); + } + + guard +} + +pub async fn trace_shared_lock< + T: Clone + Display + Eq + PartialEq + std::hash::Hash, + I: Display, +>( + op_locks: &IdLockMap, + key: T, + operation: I, +) -> tokio::sync::OwnedRwLockReadGuard> { + let start = Instant::now(); + let guard = op_locks.shared(key.clone()).await; + + let duration = start.elapsed(); + if 
duration > LOCK_TIMEOUT_ALERT_THRESHOLD { + tracing::warn!( + "Operation {} on key {} has waited {:?} for shared lock", + operation, + key, + duration + ); + } + + guard +} + +#[cfg(test)] +mod tests { + use super::IdLockMap; + + #[derive(Clone, Debug, strum_macros::Display, PartialEq)] + enum Operations { + Op1, + Op2, + } + + #[tokio::test] + async fn multiple_shared_locks() { + let id_lock_map: IdLockMap = IdLockMap::default(); + + let shared_lock_1 = id_lock_map.shared(1).await; + let shared_lock_2 = id_lock_map.shared(1).await; + + assert!(shared_lock_1.is_none()); + assert!(shared_lock_2.is_none()); + } + + #[tokio::test] + async fn exclusive_locks() { + let id_lock_map = IdLockMap::default(); + let resource_id = 1; + + { + let _ex_lock = id_lock_map.exclusive(resource_id, Operations::Op1).await; + assert_eq!(_ex_lock.guard.clone().unwrap(), Operations::Op1); + + let _ex_lock_2 = tokio::time::timeout( + tokio::time::Duration::from_millis(1), + id_lock_map.exclusive(resource_id, Operations::Op2), + ) + .await; + assert!(_ex_lock_2.is_err()); + } + + let shared_lock_1 = id_lock_map.shared(resource_id).await; + assert!(shared_lock_1.is_none()); + } +} diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs new file mode 100644 index 0000000000..8caf638904 --- /dev/null +++ b/storage_controller/src/lib.rs @@ -0,0 +1,55 @@ +use serde::Serialize; +use utils::seqwait::MonotonicCounter; + +mod auth; +mod background_node_operations; +mod compute_hook; +mod heartbeater; +pub mod http; +mod id_lock_map; +pub mod metrics; +mod node; +mod pageserver_client; +pub mod persistence; +mod reconciler; +mod scheduler; +mod schema; +pub mod service; +mod tenant_shard; + +#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] +struct Sequence(u64); + +impl Sequence { + fn initial() -> Self { + Self(0) + } +} + +impl std::fmt::Display for Sequence { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + 
+impl std::fmt::Debug for Sequence { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl MonotonicCounter for Sequence { + fn cnt_advance(&mut self, v: Sequence) { + assert!(*self <= v); + *self = v; + } + fn cnt_value(&self) -> Sequence { + *self + } +} + +impl Sequence { + fn next(&self) -> Sequence { + Sequence(self.0 + 1) + } +} diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs new file mode 100644 index 0000000000..f1eb0b30fc --- /dev/null +++ b/storage_controller/src/main.rs @@ -0,0 +1,339 @@ +use anyhow::{anyhow, Context}; +use camino::Utf8PathBuf; +use clap::Parser; +use diesel::Connection; +use metrics::launch_timestamp::LaunchTimestamp; +use metrics::BuildInfo; +use std::path::PathBuf; +use std::sync::Arc; +use storage_controller::http::make_router; +use storage_controller::metrics::preinitialize_metrics; +use storage_controller::persistence::Persistence; +use storage_controller::service::{ + Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, +}; +use tokio::signal::unix::SignalKind; +use tokio_util::sync::CancellationToken; +use utils::auth::{JwtAuth, SwappableJwtAuth}; +use utils::logging::{self, LogFormat}; + +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version, tcp_listener}; + +project_git_version!(GIT_VERSION); +project_build_tag!(BUILD_TAG); + +use diesel_migrations::{embed_migrations, EmbeddedMigrations}; +pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations"); + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +#[command(arg_required_else_help(true))] +struct Cli { + /// Host and port to listen on, like `127.0.0.1:1234` + #[arg(short, long)] + listen: std::net::SocketAddr, + + /// Public key for JWT authentication of clients + #[arg(long)] + public_key: Option, + + /// Token for authenticating this service with the pageservers it controls + 
#[arg(long)] + jwt_token: Option, + + /// Token for authenticating this service with the control plane, when calling + /// the compute notification endpoint + #[arg(long)] + control_plane_jwt_token: Option, + + /// URL to control plane compute notification endpoint + #[arg(long)] + compute_hook_url: Option, + + /// Path to the .json file to store state (will be created if it doesn't exist) + #[arg(short, long)] + path: Option, + + /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller + #[arg(long)] + database_url: Option, + + /// Flag to enable dev mode, which permits running without auth + #[arg(long, default_value = "false")] + dev: bool, + + /// Grace period before marking unresponsive pageserver offline + #[arg(long)] + max_unavailable_interval: Option, + + /// Size threshold for automatically splitting shards (disabled by default) + #[arg(long)] + split_threshold: Option, + + /// Maximum number of reconcilers that may run in parallel + #[arg(long)] + reconciler_concurrency: Option, + + /// How long to wait for the initial database connection to be available. + #[arg(long, default_value = "5s")] + db_connect_timeout: humantime::Duration, + + /// `neon_local` sets this to the path of the neon_local repo dir. + /// Only relevant for testing. + // TODO: make `cfg(feature = "testing")` + #[arg(long)] + neon_local_repo_dir: Option, +} + +enum StrictMode { + /// In strict mode, we will require that all secrets are loaded, i.e. security features + /// may not be implicitly turned off by omitting secrets in the environment. + Strict, + /// In dev mode, secrets are optional, and omitting a particular secret will implicitly + /// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated + /// requests, no public key -> don't authenticate incoming requests). 
+ Dev, +} + +impl Default for StrictMode { + fn default() -> Self { + Self::Strict + } +} + +/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this +/// type encapsulates the logic to decide which and do the loading. +struct Secrets { + database_url: String, + public_key: Option, + jwt_token: Option, + control_plane_jwt_token: Option, +} + +impl Secrets { + const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; + const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; + const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; + + /// Load secrets from, in order of preference: + /// - CLI args if database URL is provided on the CLI + /// - Environment variables if DATABASE_URL is set. + /// - AWS Secrets Manager secrets + async fn load(args: &Cli) -> anyhow::Result { + let Some(database_url) = + Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await + else { + anyhow::bail!( + "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)" + ) + }; + + let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await { + Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?), + None => None, + }; + + let this = Self { + database_url, + public_key, + jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await, + control_plane_jwt_token: Self::load_secret( + &args.control_plane_jwt_token, + Self::CONTROL_PLANE_JWT_TOKEN_ENV, + ) + .await, + }; + + Ok(this) + } + + async fn load_secret(cli: &Option, env_name: &str) -> Option { + if let Some(v) = cli { + Some(v.clone()) + } else if let Ok(v) = std::env::var(env_name) { + Some(v) + } else { + None + } + } +} + +/// Execute the diesel migrations that are built into this binary +async fn migration_run(database_url: &str) -> anyhow::Result<()> { + use diesel::PgConnection; + use 
diesel_migrations::{HarnessWithOutput, MigrationHarness}; + let mut conn = PgConnection::establish(database_url)?; + + HarnessWithOutput::write_to_stdout(&mut conn) + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| anyhow::anyhow!(e))?; + + Ok(()) +} + +fn main() -> anyhow::Result<()> { + let default_panic = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + default_panic(info); + std::process::exit(1); + })); + + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + + tokio::runtime::Builder::new_current_thread() + // We use spawn_blocking for database operations, so require approximately + // as many blocking threads as we will open database connections. + .max_blocking_threads(Persistence::MAX_CONNECTIONS as usize) + .enable_all() + .build() + .unwrap() + .block_on(async_main()) +} + +async fn async_main() -> anyhow::Result<()> { + let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate())); + + logging::init( + LogFormat::Plain, + logging::TracingErrorLayerEnablement::Disabled, + logging::Output::Stdout, + )?; + + preinitialize_metrics(); + + let args = Cli::parse(); + tracing::info!( + "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}", + GIT_VERSION, + launch_ts.to_string(), + BUILD_TAG, + args.path.as_ref().unwrap_or(&Utf8PathBuf::from("")), + args.listen + ); + + let build_info = BuildInfo { + revision: GIT_VERSION, + build_tag: BUILD_TAG, + }; + + let strict_mode = if args.dev { + StrictMode::Dev + } else { + StrictMode::Strict + }; + + let secrets = Secrets::load(&args).await?; + + // Validate required secrets and arguments are provided in strict mode + match strict_mode { + StrictMode::Strict + if (secrets.public_key.is_none() + || secrets.jwt_token.is_none() + || secrets.control_plane_jwt_token.is_none()) => + { + // Production systems should always have secrets configured: if public_key was not set + // then we would implicitly disable auth. 
+ anyhow::bail!( + "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" + ); + } + StrictMode::Strict if args.compute_hook_url.is_none() => { + // Production systems should always have a compute hook set, to prevent falling + // back to trying to use neon_local. + anyhow::bail!( + "`--compute-hook-url` is not set: this is only permitted in `--dev` mode" + ); + } + StrictMode::Strict => { + tracing::info!("Starting in strict mode: configuration is OK.") + } + StrictMode::Dev => { + tracing::warn!("Starting in dev mode: this may be an insecure configuration.") + } + } + + let config = Config { + jwt_token: secrets.jwt_token, + control_plane_jwt_token: secrets.control_plane_jwt_token, + compute_hook_url: args.compute_hook_url, + max_unavailable_interval: args + .max_unavailable_interval + .map(humantime::Duration::into) + .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT), + reconciler_concurrency: args + .reconciler_concurrency + .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT), + split_threshold: args.split_threshold, + neon_local_repo_dir: args.neon_local_repo_dir, + }; + + // After loading secrets & config, but before starting anything else, apply database migrations + Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?; + + migration_run(&secrets.database_url) + .await + .context("Running database migrations")?; + + let json_path = args.path; + let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone())); + + let service = Service::spawn(config, persistence.clone()).await?; + + let http_listener = tcp_listener::bind(args.listen)?; + + let auth = secrets + .public_key + .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth))); + let router = make_router(service.clone(), auth, build_info) + .build() + .map_err(|err| anyhow!(err))?; + let router_service = utils::http::RouterService::new(router).unwrap(); + + // Start HTTP server + let server_shutdown = 
CancellationToken::new(); + let server = hyper::Server::from_tcp(http_listener)? + .serve(router_service) + .with_graceful_shutdown({ + let server_shutdown = server_shutdown.clone(); + async move { + server_shutdown.cancelled().await; + } + }); + tracing::info!("Serving on {0}", args.listen); + let server_task = tokio::task::spawn(server); + + // Wait until we receive a signal + let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; + let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; + let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate())?; + tokio::select! { + _ = sigint.recv() => {}, + _ = sigterm.recv() => {}, + _ = sigquit.recv() => {}, + } + tracing::info!("Terminating on signal"); + + if json_path.is_some() { + // Write out a JSON dump on shutdown: this is used in compat tests to avoid passing + // full postgres dumps around. + if let Err(e) = persistence.write_tenants_json().await { + tracing::error!("Failed to write JSON on shutdown: {e}") + } + } + + // Stop HTTP server first, so that we don't have to service requests + // while shutting down Service + server_shutdown.cancel(); + if let Err(e) = server_task.await { + tracing::error!("Error joining HTTP server task: {e}") + } + tracing::info!("Joined HTTP server task"); + + service.shutdown().await; + tracing::info!("Service shutdown complete"); + + std::process::exit(0); +} diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs new file mode 100644 index 0000000000..ac9f22c739 --- /dev/null +++ b/storage_controller/src/metrics.rs @@ -0,0 +1,232 @@ +//! +//! This module provides metric definitions for the storage controller. +//! +//! All metrics are grouped in [`StorageControllerMetricGroup`]. [`StorageControllerMetrics`] holds +//! the mentioned metrics and their encoder. It's globally available via the [`METRICS_REGISTRY`] +//! constant. +//! +//! 
The rest of the code defines label group types and deals with converting outer types to labels. +//! +use bytes::Bytes; +use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup}; +use metrics::NeonMetrics; +use once_cell::sync::Lazy; +use std::sync::Mutex; + +use crate::persistence::{DatabaseError, DatabaseOperation}; + +pub(crate) static METRICS_REGISTRY: Lazy = + Lazy::new(StorageControllerMetrics::default); + +pub fn preinitialize_metrics() { + Lazy::force(&METRICS_REGISTRY); +} + +pub(crate) struct StorageControllerMetrics { + pub(crate) metrics_group: StorageControllerMetricGroup, + encoder: Mutex, +} + +#[derive(measured::MetricGroup)] +#[metric(new())] +pub(crate) struct StorageControllerMetricGroup { + /// Count of how many times we spawn a reconcile task + pub(crate) storage_controller_reconcile_spawn: measured::Counter, + + /// Reconciler tasks completed, broken down by success/failure/cancelled + pub(crate) storage_controller_reconcile_complete: + measured::CounterVec, + + /// Count of how many times we make an optimization change to a tenant's scheduling + pub(crate) storage_controller_schedule_optimization: measured::Counter, + + /// HTTP request status counters for handled requests + pub(crate) storage_controller_http_request_status: + measured::CounterVec, + + /// HTTP request handler latency across all status codes + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] + pub(crate) storage_controller_http_request_latency: + measured::HistogramVec, + + /// Count of HTTP requests to the pageserver that resulted in an error, + /// broken down by the pageserver node id, request name and method + pub(crate) storage_controller_pageserver_request_error: + measured::CounterVec, + + /// Latency of HTTP requests to the pageserver, broken down by pageserver + /// node id, request name and method. This includes both successful and unsuccessful + /// requests. 
+ #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] + pub(crate) storage_controller_pageserver_request_latency: + measured::HistogramVec, + + /// Count of pass-through HTTP requests to the pageserver that resulted in an error, + /// broken down by the pageserver node id, request name and method + pub(crate) storage_controller_passthrough_request_error: + measured::CounterVec, + + /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver + /// node id, request name and method. This includes both successful and unsuccessful + /// requests. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] + pub(crate) storage_controller_passthrough_request_latency: + measured::HistogramVec, + + /// Count of errors in database queries, broken down by error type and operation. + pub(crate) storage_controller_database_query_error: + measured::CounterVec, + + /// Latency of database queries, broken down by operation. + #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))] + pub(crate) storage_controller_database_query_latency: + measured::HistogramVec, +} + +impl StorageControllerMetrics { + pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes { + let mut encoder = self.encoder.lock().unwrap(); + neon_metrics + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + self.metrics_group + .collect_group_into(&mut *encoder) + .unwrap_or_else(|infallible| match infallible {}); + encoder.finish() + } +} + +impl Default for StorageControllerMetrics { + fn default() -> Self { + let mut metrics_group = StorageControllerMetricGroup::new(); + metrics_group + .storage_controller_reconcile_complete + .init_all_dense(); + + Self { + metrics_group, + encoder: Mutex::new(measured::text::BufferedTextEncoder::new()), + } + } +} + +#[derive(measured::LabelGroup)] +#[label(set = ReconcileCompleteLabelGroupSet)] +pub(crate) struct ReconcileCompleteLabelGroup { + 
pub(crate) status: ReconcileOutcome, +} + +#[derive(measured::LabelGroup)] +#[label(set = HttpRequestStatusLabelGroupSet)] +pub(crate) struct HttpRequestStatusLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) path: &'a str, + pub(crate) method: Method, + pub(crate) status: StatusCode, +} + +#[derive(measured::LabelGroup)] +#[label(set = HttpRequestLatencyLabelGroupSet)] +pub(crate) struct HttpRequestLatencyLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) path: &'a str, + pub(crate) method: Method, +} + +#[derive(measured::LabelGroup, Clone)] +#[label(set = PageserverRequestLabelGroupSet)] +pub(crate) struct PageserverRequestLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) pageserver_id: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) path: &'a str, + pub(crate) method: Method, +} + +#[derive(measured::LabelGroup)] +#[label(set = DatabaseQueryErrorLabelGroupSet)] +pub(crate) struct DatabaseQueryErrorLabelGroup { + pub(crate) error_type: DatabaseErrorLabel, + pub(crate) operation: DatabaseOperation, +} + +#[derive(measured::LabelGroup)] +#[label(set = DatabaseQueryLatencyLabelGroupSet)] +pub(crate) struct DatabaseQueryLatencyLabelGroup { + pub(crate) operation: DatabaseOperation, +} + +#[derive(FixedCardinalityLabel, Clone, Copy)] +pub(crate) enum ReconcileOutcome { + #[label(rename = "ok")] + Success, + Error, + Cancel, +} + +#[derive(FixedCardinalityLabel, Copy, Clone)] +pub(crate) enum Method { + Get, + Put, + Post, + Delete, + Other, +} + +impl From for Method { + fn from(value: hyper::Method) -> Self { + if value == hyper::Method::GET { + Method::Get + } else if value == hyper::Method::PUT { + Method::Put + } else if value == hyper::Method::POST { + Method::Post + } else if value == hyper::Method::DELETE { + Method::Delete + } else { + Method::Other + } + } +} + +#[derive(Clone, Copy)] +pub(crate) struct 
StatusCode(pub(crate) hyper::http::StatusCode); + +impl LabelValue for StatusCode { + fn visit(&self, v: V) -> V::Output { + v.write_int(self.0.as_u16() as i64) + } +} + +impl FixedCardinalityLabel for StatusCode { + fn cardinality() -> usize { + (100..1000).len() + } + + fn encode(&self) -> usize { + self.0.as_u16() as usize + } + + fn decode(value: usize) -> Self { + Self(hyper::http::StatusCode::from_u16(u16::try_from(value).unwrap()).unwrap()) + } +} + +#[derive(FixedCardinalityLabel, Clone, Copy)] +pub(crate) enum DatabaseErrorLabel { + Query, + Connection, + ConnectionPool, + Logical, +} + +impl DatabaseError { + pub(crate) fn error_label(&self) -> DatabaseErrorLabel { + match self { + Self::Query(_) => DatabaseErrorLabel::Query, + Self::Connection(_) => DatabaseErrorLabel::Connection, + Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool, + Self::Logical(_) => DatabaseErrorLabel::Logical, + } + } +} diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs new file mode 100644 index 0000000000..4d17dff9fe --- /dev/null +++ b/storage_controller/src/node.rs @@ -0,0 +1,300 @@ +use std::{str::FromStr, time::Duration}; + +use pageserver_api::{ + controller_api::{ + NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, + TenantLocateResponseShard, UtilizationScore, + }, + shard::TenantShardId, +}; +use pageserver_client::mgmt_api; +use reqwest::StatusCode; +use serde::Serialize; +use tokio_util::sync::CancellationToken; +use utils::{backoff, id::NodeId}; + +use crate::{ + pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule, +}; + +/// Represents the in-memory description of a Node. +/// +/// Scheduling statistics are maintened separately in [`crate::scheduler`]. +/// +/// The persistent subset of the Node is defined in [`crate::persistence::NodePersistence`]: the +/// implementation of serialization on this type is only for debug dumps. 
+#[derive(Clone, Serialize)] +pub(crate) struct Node { + id: NodeId, + + availability: NodeAvailability, + scheduling: NodeSchedulingPolicy, + + listen_http_addr: String, + listen_http_port: u16, + + listen_pg_addr: String, + listen_pg_port: u16, + + // This cancellation token means "stop any RPCs in flight to this node, and don't start + // any more". It is not related to process shutdown. + #[serde(skip)] + cancel: CancellationToken, +} + +/// When updating [`Node::availability`] we use this type to indicate to the caller +/// whether/how they changed it. +pub(crate) enum AvailabilityTransition { + ToActive, + ToOffline, + Unchanged, +} + +impl Node { + pub(crate) fn base_url(&self) -> String { + format!("http://{}:{}", self.listen_http_addr, self.listen_http_port) + } + + pub(crate) fn get_id(&self) -> NodeId { + self.id + } + + pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { + self.scheduling + } + + pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) { + self.scheduling = scheduling + } + + /// Does this registration request match `self`? This is used when deciding whether a registration + /// request should be allowed to update an existing record with the same node ID. + pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { + self.id == register_req.node_id + && self.listen_http_addr == register_req.listen_http_addr + && self.listen_http_port == register_req.listen_http_port + && self.listen_pg_addr == register_req.listen_pg_addr + && self.listen_pg_port == register_req.listen_pg_port + } + + /// For a shard located on this node, populate a response object + /// with this node's address information. 
+ pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard { + TenantLocateResponseShard { + shard_id, + node_id: self.id, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } + + pub(crate) fn set_availability(&mut self, availability: NodeAvailability) { + match self.get_availability_transition(availability) { + AvailabilityTransition::ToActive => { + // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any + // users of previously-cloned copies of the node will still see the old cancellation + // state. For example, Reconcilers in flight will have to complete and be spawned + // again to realize that the node has become available. + self.cancel = CancellationToken::new(); + } + AvailabilityTransition::ToOffline => { + // Fire the node's cancellation token to cancel any in-flight API requests to it + self.cancel.cancel(); + } + AvailabilityTransition::Unchanged => {} + } + self.availability = availability; + } + + /// Without modifying the availability of the node, convert the intended availability + /// into a description of the transition. + pub(crate) fn get_availability_transition( + &self, + availability: NodeAvailability, + ) -> AvailabilityTransition { + use AvailabilityTransition::*; + use NodeAvailability::*; + + match (self.availability, availability) { + (Offline, Active(_)) => ToActive, + (Active(_), Offline) => ToOffline, + // Consider the case when the storage controller handles the re-attach of a node + // before the heartbeats detect that the node is back online. We still need + // [`Service::node_configure`] to attempt reconciliations for shards with an + // unknown observed location. + // The unsavoury match arm below handles this situation. 
+ (Active(lhs), Active(rhs)) + if lhs == UtilizationScore::worst() && rhs < UtilizationScore::worst() => + { + ToActive + } + _ => Unchanged, + } + } + + /// Whether we may send API requests to this node. + pub(crate) fn is_available(&self) -> bool { + // When we clone a node, [`Self::availability`] is a snapshot, but [`Self::cancel`] holds + // a reference to the original Node's cancellation status. Checking both of these results + // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable + // when we cloned it, or if the original Node instance's cancellation token was fired. + matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled() + } + + /// Is this node eligible to have work scheduled onto it? + /// + /// Offline nodes, and nodes whose policy is Draining, Pause or PauseForRestart, yield + /// `MaySchedule::No`; otherwise the score carried by the node's availability is returned. + pub(crate) fn may_schedule(&self) -> MaySchedule { + let score = match self.availability { + NodeAvailability::Active(score) => score, + NodeAvailability::Offline => return MaySchedule::No, + }; + + match self.scheduling { + NodeSchedulingPolicy::Active => MaySchedule::Yes(score), + NodeSchedulingPolicy::Draining => MaySchedule::No, + NodeSchedulingPolicy::Filling => MaySchedule::Yes(score), + NodeSchedulingPolicy::Pause => MaySchedule::No, + NodeSchedulingPolicy::PauseForRestart => MaySchedule::No, + } + } + + /// Create a node that starts out Offline, with the Active scheduling policy and a fresh + /// cancellation token. 
+ pub(crate) fn new( + id: NodeId, + listen_http_addr: String, + listen_http_port: u16, + listen_pg_addr: String, + listen_pg_port: u16, + ) -> Self { + Self { + id, + listen_http_addr, + listen_http_port, + listen_pg_addr, + listen_pg_port, + scheduling: NodeSchedulingPolicy::Active, + availability: NodeAvailability::Offline, + cancel: CancellationToken::new(), + } + } + + pub(crate) fn to_persistent(&self) -> NodePersistence { + NodePersistence { + node_id: self.id.0 as i64, + scheduling_policy: self.scheduling.into(), + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port as i32, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: 
self.listen_pg_port as i32, + } + } + + pub(crate) fn from_persistent(np: NodePersistence) -> Self { + Self { + id: NodeId(np.node_id as u64), + // At startup we consider a node offline until proven otherwise. + availability: NodeAvailability::Offline, + scheduling: NodeSchedulingPolicy::from_str(&np.scheduling_policy) + .expect("Bad scheduling policy in DB"), + listen_http_addr: np.listen_http_addr, + listen_http_port: np.listen_http_port as u16, + listen_pg_addr: np.listen_pg_addr, + listen_pg_port: np.listen_pg_port as u16, + cancel: CancellationToken::new(), + } + } + + /// Wrapper for issuing requests to pageserver management API: takes care of generic + /// retry/backoff for retryable HTTP status codes. + /// + /// This will return None to indicate cancellation. Cancellation may happen from + /// the cancellation token passed in, or from Self's cancellation token (i.e. node + /// going offline). + pub(crate) async fn with_client_retries( + &self, + mut op: O, + jwt: &Option, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Option> + where + O: FnMut(PageserverClient) -> F, + F: std::future::Future>, + { + fn is_fatal(e: &mgmt_api::Error) -> bool { + use mgmt_api::Error::*; + match e { + ReceiveBody(_) | ReceiveErrorBody(_) => false, + ApiError(StatusCode::SERVICE_UNAVAILABLE, _) + | ApiError(StatusCode::GATEWAY_TIMEOUT, _) + | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, + ApiError(_, _) => true, + Cancelled => true, + } + } + + backoff::retry( + || { + let http_client = reqwest::ClientBuilder::new() + .timeout(timeout) + .build() + .expect("Failed to construct HTTP client"); + + let client = PageserverClient::from_client( + self.get_id(), + http_client, + self.base_url(), + jwt.as_deref(), + ); + + let node_cancel_fut = self.cancel.cancelled(); + + let op_fut = op(client); + + async { + tokio::select! 
{ + r = op_fut=> {r}, + _ = node_cancel_fut => { + Err(mgmt_api::Error::Cancelled) + }} + } + }, + is_fatal, + warn_threshold, + max_retries, + &format!( + "Call to node {} ({}:{}) management API", + self.id, self.listen_http_addr, self.listen_http_port + ), + cancel, + ) + .await + } + + /// Generate the simplified API-friendly description of a node's state + pub(crate) fn describe(&self) -> NodeDescribeResponse { + NodeDescribeResponse { + id: self.id, + availability: self.availability.into(), + scheduling: self.scheduling, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } +} + +impl std::fmt::Display for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } +} + +impl std::fmt::Debug for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{} ({})", self.id, self.listen_http_addr) + } +} diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs new file mode 100644 index 0000000000..769aba80ca --- /dev/null +++ b/storage_controller/src/pageserver_client.rs @@ -0,0 +1,249 @@ +use pageserver_api::{ + models::{ + LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, + TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, + TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, + }, + shard::TenantShardId, +}; +use pageserver_client::mgmt_api::{Client, Result}; +use reqwest::StatusCode; +use utils::id::{NodeId, TenantId, TimelineId}; + +/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage +/// controller to collect metrics in a non-intrusive manner. 
+#[derive(Debug, Clone)] +pub(crate) struct PageserverClient { + inner: Client, + node_id_label: String, +} + +macro_rules! measured_request { + ($name:literal, $method:expr, $node_id: expr, $invoke:expr) => {{ + let labels = crate::metrics::PageserverRequestLabelGroup { + pageserver_id: $node_id, + path: $name, + method: $method, + }; + + let latency = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_latency; + let _timer_guard = latency.start_timer(labels.clone()); + + let res = $invoke; + + if res.is_err() { + let error_counters = &crate::metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_request_error; + error_counters.inc(labels) + } + + res + }}; +} + +impl PageserverClient { + pub(crate) fn new(node_id: NodeId, mgmt_api_endpoint: String, jwt: Option<&str>) -> Self { + Self { + inner: Client::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) fn from_client( + node_id: NodeId, + raw_client: reqwest::Client, + mgmt_api_endpoint: String, + jwt: Option<&str>, + ) -> Self { + Self { + inner: Client::from_client(raw_client, mgmt_api_endpoint, jwt), + node_id_label: node_id.0.to_string(), + } + } + + pub(crate) async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result { + measured_request!( + "tenant", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner.tenant_delete(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_time_travel_remote_storage( + &self, + tenant_shard_id: TenantShardId, + timestamp: &str, + done_if_after: &str, + ) -> Result<()> { + measured_request!( + "tenant_time_travel_remote_storage", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .tenant_time_travel_remote_storage(tenant_shard_id, timestamp, done_if_after) + .await + ) + } + + pub(crate) async fn tenant_scan_remote_storage( + &self, + tenant_id: TenantId, + ) -> Result { + 
measured_request!( + "tenant_scan_remote_storage", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_scan_remote_storage(tenant_id).await + ) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantShardId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress)> { + measured_request!( + "tenant_secondary_download", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_secondary_download(tenant_id, wait).await + ) + } + + pub(crate) async fn tenant_secondary_status( + &self, + tenant_shard_id: TenantShardId, + ) -> Result { + measured_request!( + "tenant_secondary_status", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.tenant_secondary_status(tenant_shard_id).await + ) + } + + pub(crate) async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> { + measured_request!( + "tenant_heatmap_upload", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.tenant_heatmap_upload(tenant_id).await + ) + } + + pub(crate) async fn location_config( + &self, + tenant_shard_id: TenantShardId, + config: LocationConfig, + flush_ms: Option, + lazy: bool, + ) -> Result<()> { + measured_request!( + "location_config", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .location_config(tenant_shard_id, config, flush_ms, lazy) + .await + ) + } + + pub(crate) async fn list_location_config(&self) -> Result { + measured_request!( + "location_configs", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.list_location_config().await + ) + } + + pub(crate) async fn get_location_config( + &self, + tenant_shard_id: TenantShardId, + ) -> Result> { + measured_request!( + "location_config", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_location_config(tenant_shard_id).await + ) + } + + pub(crate) async fn timeline_create( + &self, + tenant_shard_id: TenantShardId, + req: &TimelineCreateRequest, + ) -> Result { + 
measured_request!( + "timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.timeline_create(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_delete( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + ) -> Result { + measured_request!( + "timeline", + crate::metrics::Method::Delete, + &self.node_id_label, + self.inner + .timeline_delete(tenant_shard_id, timeline_id) + .await + ) + } + + pub(crate) async fn tenant_shard_split( + &self, + tenant_shard_id: TenantShardId, + req: TenantShardSplitRequest, + ) -> Result { + measured_request!( + "tenant_shard_split", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner.tenant_shard_split(tenant_shard_id, req).await + ) + } + + pub(crate) async fn timeline_list( + &self, + tenant_shard_id: &TenantShardId, + ) -> Result> { + measured_request!( + "timelines", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.timeline_list(tenant_shard_id).await + ) + } + + pub(crate) async fn get_utilization(&self) -> Result { + measured_request!( + "utilization", + crate::metrics::Method::Get, + &self.node_id_label, + self.inner.get_utilization().await + ) + } + + pub(crate) async fn top_tenant_shards( + &self, + request: TopTenantShardsRequest, + ) -> Result { + measured_request!( + "top_tenants", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.top_tenant_shards(request).await + ) + } +} diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs new file mode 100644 index 0000000000..47caf7ae81 --- /dev/null +++ b/storage_controller/src/persistence.rs @@ -0,0 +1,827 @@ +pub(crate) mod split_state; +use std::collections::HashMap; +use std::str::FromStr; +use std::time::Duration; +use std::time::Instant; + +use self::split_state::SplitState; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use diesel::pg::PgConnection; +use diesel::prelude::*; +use diesel::Connection; +use 
pageserver_api::controller_api::ShardSchedulingPolicy; +use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; +use pageserver_api::models::TenantConfig; +use pageserver_api::shard::ShardConfigError; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::shard::ShardStripeSize; +use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId}; +use serde::{Deserialize, Serialize}; +use utils::generation::Generation; +use utils::id::{NodeId, TenantId}; + +use crate::metrics::{ + DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY, +}; +use crate::node::Node; + +/// ## What do we store? +/// +/// The storage controller service does not store most of its state durably. +/// +/// The essential things to store durably are: +/// - generation numbers, as these must always advance monotonically to ensure data safety. +/// - Tenant's PlacementPolicy and TenantConfig, as the source of truth for these is something external. +/// - Node's scheduling policies, as the source of truth for these is something external. +/// +/// Other things we store durably as an implementation detail: +/// - Node's host/port: this could be avoided if we made nodes emit a self-registering heartbeat, +/// but it is operationally simpler to make this service the authority for which nodes +/// it talks to. +/// +/// ## Performance/efficiency +/// +/// The storage controller service does not go via the database for most things: there are +/// a couple of places where we must, and where efficiency matters: +/// - Incrementing generation numbers: the Reconciler has to wait for this to complete +/// before it can attach a tenant, so this acts as a bound on how fast things like +/// failover can happen. +/// - Pageserver re-attach: we will increment many shards' generations when this happens, +/// so it is important to avoid e.g. issuing O(N) queries. 
+/// +/// Database calls relating to nodes have low performance requirements, as they are very rarely +/// updated, and reads of nodes are always from memory, not the database. We only require that +/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline. +pub struct Persistence { + connection_pool: diesel::r2d2::Pool>, + + // In test environments, we support loading+saving a JSON file. This is temporary, for the benefit of + // test_compatibility.py, so that we don't have to commit to making the database contents fully backward/forward + // compatible just yet. + json_path: Option, +} + +/// Legacy format, for use in JSON compat objects in test environment +#[derive(Serialize, Deserialize)] +struct JsonPersistence { + tenants: HashMap, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum DatabaseError { + #[error(transparent)] + Query(#[from] diesel::result::Error), + #[error(transparent)] + Connection(#[from] diesel::result::ConnectionError), + #[error(transparent)] + ConnectionPool(#[from] r2d2::Error), + #[error("Logical error: {0}")] + Logical(String), +} + +#[derive(measured::FixedCardinalityLabel, Copy, Clone)] +pub(crate) enum DatabaseOperation { + InsertNode, + UpdateNode, + DeleteNode, + ListNodes, + BeginShardSplit, + CompleteShardSplit, + AbortShardSplit, + Detach, + ReAttach, + IncrementGeneration, + ListTenantShards, + InsertTenantShards, + UpdateTenantShard, + DeleteTenant, + UpdateTenantConfig, +} + +#[must_use] +pub(crate) enum AbortShardSplitStatus { + /// We aborted the split in the database by reverting to the parent shards + Aborted, + /// The split had already been persisted. + Complete, +} + +pub(crate) type DatabaseResult = Result; + +/// Some methods can operate on either a whole tenant or a single shard +pub(crate) enum TenantFilter { + Tenant(TenantId), + Shard(TenantShardId), +} + +impl Persistence { + // The default postgres connection limit is 100. 
We use up to 99, to leave one free for a human admin under + // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. + pub const MAX_CONNECTIONS: u32 = 99; + + // We don't want to keep a lot of connections alive: close them down promptly if they aren't being used. + const IDLE_CONNECTION_TIMEOUT: Duration = Duration::from_secs(10); + const MAX_CONNECTION_LIFETIME: Duration = Duration::from_secs(60); + + pub fn new(database_url: String, json_path: Option) -> Self { + let manager = diesel::r2d2::ConnectionManager::::new(database_url); + + // We will use a connection pool: this is primarily to _limit_ our connection count, rather than to optimize time + // to execute queries (database queries are not generally on latency-sensitive paths). + let connection_pool = diesel::r2d2::Pool::builder() + .max_size(Self::MAX_CONNECTIONS) + .max_lifetime(Some(Self::MAX_CONNECTION_LIFETIME)) + .idle_timeout(Some(Self::IDLE_CONNECTION_TIMEOUT)) + // Always keep at least one connection ready to go + .min_idle(Some(1)) + .test_on_check_out(true) + .build(manager) + .expect("Could not build connection pool"); + + Self { + connection_pool, + json_path, + } + } + + /// A helper for use during startup, where we would like to tolerate concurrent restarts of the + /// database and the storage controller, therefore the database might not be available right away + pub async fn await_connection( + database_url: &str, + timeout: Duration, + ) -> Result<(), diesel::ConnectionError> { + let started_at = Instant::now(); + loop { + match PgConnection::establish(database_url) { + Ok(_) => { + tracing::info!("Connected to database."); + return Ok(()); + } + Err(e) => { + if started_at.elapsed() > timeout { + return Err(e); + } else { + tracing::info!("Database not yet available, waiting... 
({e})"); + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + } + } + + /// Wraps `with_conn` in order to collect latency and error metrics + async fn with_measured_conn(&self, op: DatabaseOperation, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + let latency = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_latency; + let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op }); + + let res = self.with_conn(func).await; + + if let Err(err) = &res { + let error_counter = &METRICS_REGISTRY + .metrics_group + .storage_controller_database_query_error; + error_counter.inc(DatabaseQueryErrorLabelGroup { + error_type: err.error_label(), + operation: op, + }) + } + + res + } + + /// Call the provided function in a tokio blocking thread, with a Diesel database connection. + async fn with_conn(&self, func: F) -> DatabaseResult + where + F: Fn(&mut PgConnection) -> DatabaseResult + Send + 'static, + R: Send + 'static, + { + // A generous allowance for how many times we may retry serializable transactions + // before giving up. This is not expected to be hit: it is a defensive measure in case we + // somehow engineer a situation where duelling transactions might otherwise live-lock. 
+ const MAX_RETRIES: usize = 128; + + let mut conn = self.connection_pool.get()?; + tokio::task::spawn_blocking(move || -> DatabaseResult { + let mut retry_count = 0; + loop { + match conn.build_transaction().serializable().run(|c| func(c)) { + Ok(r) => break Ok(r), + Err( + err @ DatabaseError::Query(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + _, + )), + ) => { + retry_count += 1; + if retry_count > MAX_RETRIES { + tracing::error!( + "Exceeded max retries on SerializationFailure errors: {err:?}" + ); + break Err(err); + } else { + // Retry on serialization errors: these are expected, because even though our + // transactions don't fight for the same rows, they will occasionally collide + // on index pages (e.g. increment_generation for unrelated shards can collide) + tracing::debug!( + "Retrying transaction on serialization failure {err:?}" + ); + continue; + } + } + Err(e) => break Err(e), + } + } + }) + .await + .expect("Task panic") + } + + /// When a node is first registered, persist it before using it for anything + pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> { + let np = node.to_persistent(); + self.with_measured_conn( + DatabaseOperation::InsertNode, + move |conn| -> DatabaseResult<()> { + diesel::insert_into(crate::schema::nodes::table) + .values(&np) + .execute(conn)?; + Ok(()) + }, + ) + .await + } + + /// At startup, populate the list of nodes which our shards may be placed on + pub(crate) async fn list_nodes(&self) -> DatabaseResult> { + let nodes: Vec = self + .with_measured_conn( + DatabaseOperation::ListNodes, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::nodes::table.load::(conn)?) 
+ }, + ) + .await?; + + tracing::info!("list_nodes: loaded {} nodes", nodes.len()); + + Ok(nodes) + } + + /// Persist a new scheduling policy for the node identified by `input_node_id`. + /// + /// Returns a [`DatabaseError::Logical`] error unless exactly one row was updated. + pub(crate) async fn update_node( + &self, + input_node_id: NodeId, + input_scheduling: NodeSchedulingPolicy, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + let updated = self + .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((scheduling_policy.eq(String::from(input_scheduling)),)) + .execute(conn)?; + Ok(updated) + }) + .await?; + + if updated != 1 { + // Report the requested id: a bare `node_id` here is the schema column brought in + // by the dsl glob import, not the caller's value. + Err(DatabaseError::Logical(format!( + "Node {input_node_id} not found for update", + ))) + } else { + Ok(()) + } + } + + /// At startup, load the high level state for shards, such as their config + policy. This will + /// be enriched at runtime with state discovered on pageservers. + pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { + let loaded = self + .with_measured_conn( + DatabaseOperation::ListTenantShards, + move |conn| -> DatabaseResult<_> { + Ok(crate::schema::tenant_shards::table.load::(conn)?) + }, + ) + .await?; + + if loaded.is_empty() { + if let Some(path) = &self.json_path { + if tokio::fs::try_exists(path) + .await + .map_err(|e| DatabaseError::Logical(format!("Error stat'ing JSON file: {e}")))? 
+ { + tracing::info!("Importing from legacy JSON format at {path}"); + return self.list_tenant_shards_json(path).await; + } + } + } + Ok(loaded) + } + + /// Shim for automated compatibility tests: load tenants from a JSON file instead of database + pub(crate) async fn list_tenant_shards_json( + &self, + path: &Utf8Path, + ) -> DatabaseResult> { + let bytes = tokio::fs::read(path) + .await + .map_err(|e| DatabaseError::Logical(format!("Failed to load JSON: {e}")))?; + + let mut decoded = serde_json::from_slice::(&bytes) + .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?; + for shard in decoded.tenants.values_mut() { + if shard.placement_policy == "\"Single\"" { + // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165 + shard.placement_policy = "{\"Attached\":0}".to_string(); + } + + if shard.scheduling_policy.is_empty() { + shard.scheduling_policy = + serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap(); + } + } + + let tenants: Vec = decoded.tenants.into_values().collect(); + + // Synchronize database with what is in the JSON file + self.insert_tenant_shards(tenants.clone()).await?; + + Ok(tenants) + } + + /// For use in testing environments, where we dump out JSON on shutdown. 
+    ///
+    /// Loads every shard via [`Self::list_tenant_shards`], keys each one by the `TenantShardId`
+    /// built from its stored columns, and overwrites the file at `json_path`.  Fails if no JSON
+    /// path is configured or if a stored tenant id does not parse.
+    pub async fn write_tenants_json(&self) -> anyhow::Result<()> {
+        let Some(path) = &self.json_path else {
+            anyhow::bail!("Cannot write JSON if path isn't set (test environment bug)");
+        };
+        tracing::info!("Writing state to {path}...");
+        let tenants = self.list_tenant_shards().await?;
+        let mut tenants_map = HashMap::new();
+        for tsp in tenants {
+            let tenant_shard_id = TenantShardId {
+                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
+                shard_number: ShardNumber(tsp.shard_number as u8),
+                shard_count: ShardCount::new(tsp.shard_count as u8),
+            };
+
+            tenants_map.insert(tenant_shard_id, tsp);
+        }
+        let json = serde_json::to_string(&JsonPersistence {
+            tenants: tenants_map,
+        })?;
+
+        tokio::fs::write(path, &json).await?;
+        tracing::info!("Wrote {} bytes to {path}...", json.len());
+
+        Ok(())
+    }
+
+    /// Tenants must be persisted before we schedule them for the first time.  This enables us
+    /// to correctly retain generation monotonicity, and the externally provided placement policy & config.
+    ///
+    /// Rows are inserted one at a time, in the order given; the first failing insert aborts
+    /// the loop and its error is returned.
+    pub(crate) async fn insert_tenant_shards(
+        &self,
+        shards: Vec<TenantShardPersistence>,
+    ) -> DatabaseResult<()> {
+        use crate::schema::tenant_shards::dsl::*;
+        self.with_measured_conn(
+            DatabaseOperation::InsertTenantShards,
+            move |conn| -> DatabaseResult<()> {
+                for tenant in &shards {
+                    diesel::insert_into(tenant_shards)
+                        .values(tenant)
+                        .execute(conn)?;
+                }
+                Ok(())
+            },
+        )
+        .await
+    }
+
+    /// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
+    /// the tenant from memory on this server.
+ pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn( + DatabaseOperation::DeleteTenant, + move |conn| -> DatabaseResult<()> { + diesel::delete(tenant_shards) + .filter(tenant_id.eq(del_tenant_id.to_string())) + .execute(conn)?; + + Ok(()) + }, + ) + .await + } + + pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + self.with_measured_conn( + DatabaseOperation::DeleteNode, + move |conn| -> DatabaseResult<()> { + diesel::delete(nodes) + .filter(node_id.eq(del_node_id.0 as i64)) + .execute(conn)?; + + Ok(()) + }, + ) + .await + } + + /// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient + /// batched increment of the generations of all tenants whose generation_pageserver is equal to + /// the node that called /re-attach. + #[tracing::instrument(skip_all, fields(node_id))] + pub(crate) async fn re_attach( + &self, + input_node_id: NodeId, + ) -> DatabaseResult> { + use crate::schema::nodes::dsl::scheduling_policy; + use crate::schema::nodes::dsl::*; + use crate::schema::tenant_shards::dsl::*; + let updated = self + .with_measured_conn(DatabaseOperation::ReAttach, move |conn| { + let rows_updated = diesel::update(tenant_shards) + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .set(generation.eq(generation + 1)) + .execute(conn)?; + + tracing::info!("Incremented {} tenants' generations", rows_updated); + + // TODO: UPDATE+SELECT in one query + + let updated = tenant_shards + .filter(generation_pageserver.eq(input_node_id.0 as i64)) + .select(TenantShardPersistence::as_select()) + .load(conn)?; + + // If the node went through a drain and restart phase before re-attaching, + // then reset it's node scheduling policy to active. 
+ diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .filter( + scheduling_policy + .eq(String::from(NodeSchedulingPolicy::PauseForRestart)) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining))) + .or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))), + ) + .set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active))) + .execute(conn)?; + + Ok(updated) + }) + .await?; + + let mut result = HashMap::new(); + for tsp in updated { + let tenant_shard_id = TenantShardId { + tenant_id: TenantId::from_str(tsp.tenant_id.as_str()) + .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?, + shard_number: ShardNumber(tsp.shard_number as u8), + shard_count: ShardCount::new(tsp.shard_count as u8), + }; + + let Some(g) = tsp.generation else { + // If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL: + // we only set generation_pageserver when setting generation. + return Err(DatabaseError::Logical( + "Generation should always be set after incrementing".to_string(), + )); + }; + result.insert(tenant_shard_id, Generation::new(g as u32)); + } + + Ok(result) + } + + /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically + /// advancing generation number. We also store the NodeId for which the generation was issued, so that in + /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node. 
+ pub(crate) async fn increment_generation( + &self, + tenant_shard_id: TenantShardId, + node_id: NodeId, + ) -> anyhow::Result { + use crate::schema::tenant_shards::dsl::*; + let updated = self + .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation.eq(generation + 1), + generation_pageserver.eq(node_id.0 as i64), + )) + // TODO: only returning() the generation column + .returning(TenantShardPersistence::as_returning()) + .get_result(conn)?; + + Ok(updated) + }) + .await?; + + // Generation is always non-null in the rseult: if the generation column had been NULL, then we + // should have experienced an SQL Confilict error while executing a query that tries to increment it. + debug_assert!(updated.generation.is_some()); + let Some(g) = updated.generation else { + return Err(DatabaseError::Logical( + "Generation should always be set after incrementing".to_string(), + ) + .into()); + }; + + Ok(Generation::new(g as u32)) + } + + /// For use when updating a persistent property of a tenant, such as its config or placement_policy. + /// + /// Do not use this for settting generation, unless in the special onboarding code path (/location_config) + /// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing + /// that we only do the first time a tenant is set to an attached policy via /location_config. 
+ pub(crate) async fn update_tenant_shard( + &self, + tenant: TenantFilter, + input_placement_policy: Option, + input_config: Option, + input_generation: Option, + input_scheduling_policy: Option, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + + self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| { + let query = match tenant { + TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .into_boxed(), + TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards) + .filter(tenant_id.eq(input_tenant_id.to_string())) + .into_boxed(), + }; + + #[derive(AsChangeset)] + #[diesel(table_name = crate::schema::tenant_shards)] + struct ShardUpdate { + generation: Option, + placement_policy: Option, + config: Option, + scheduling_policy: Option, + } + + let update = ShardUpdate { + generation: input_generation.map(|g| g.into().unwrap() as i32), + placement_policy: input_placement_policy + .as_ref() + .map(|p| serde_json::to_string(&p).unwrap()), + config: input_config + .as_ref() + .map(|c| serde_json::to_string(&c).unwrap()), + scheduling_policy: input_scheduling_policy + .map(|p| serde_json::to_string(&p).unwrap()), + }; + + query.set(update).execute(conn)?; + + Ok(()) + }) + .await?; + + Ok(()) + } + + pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn(DatabaseOperation::Detach, move |conn| { + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32)) + .set(( + generation_pageserver.eq(Option::::None), + 
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()), + )) + .execute(conn)?; + + Ok(updated) + }) + .await?; + + Ok(()) + } + + // When we start shard splitting, we must durably mark the tenant so that + // on restart, we know that we must go through recovery. + // + // We create the child shards here, so that they will be available for increment_generation calls + // if some pageserver holding a child shard needs to restart before the overall tenant split is complete. + pub(crate) async fn begin_shard_split( + &self, + old_shard_count: ShardCount, + split_tenant_id: TenantId, + parent_to_children: Vec<(TenantShardId, Vec)>, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> { + // Mark parent shards as splitting + + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .set((splitting.eq(1),)) + .execute(conn)?; + if u8::try_from(updated) + .map_err(|_| DatabaseError::Logical( + format!("Overflow existing shard count {} while splitting", updated)) + )? != old_shard_count.count() { + // Perhaps a deletion or another split raced with this attempt to split, mutating + // the parent shards that we intend to split. In this case the split request should fail. 
+ return Err(DatabaseError::Logical( + format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count()) + )); + } + + // FIXME: spurious clone to sidestep closure move rules + let parent_to_children = parent_to_children.clone(); + + // Insert child shards + for (parent_shard_id, children) in parent_to_children { + let mut parent = crate::schema::tenant_shards::table + .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string())) + .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32)) + .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32)) + .load::(conn)?; + let parent = if parent.len() != 1 { + return Err(DatabaseError::Logical(format!( + "Parent shard {parent_shard_id} not found" + ))); + } else { + parent.pop().unwrap() + }; + for mut shard in children { + // Carry the parent's generation into the child + shard.generation = parent.generation; + + debug_assert!(shard.splitting == SplitState::Splitting); + diesel::insert_into(tenant_shards) + .values(shard) + .execute(conn)?; + } + } + + Ok(()) + }) + .await + } + + // When we finish shard splitting, we must atomically clean up the old shards + // and insert the new shards, and clear the splitting marker. 
+ pub(crate) async fn complete_shard_split( + &self, + split_tenant_id: TenantId, + old_shard_count: ShardCount, + ) -> DatabaseResult<()> { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn( + DatabaseOperation::CompleteShardSplit, + move |conn| -> DatabaseResult<()> { + // Drop parent shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(old_shard_count.literal() as i32)) + .execute(conn)?; + + // Clear sharding flag + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .set((splitting.eq(0),)) + .execute(conn)?; + debug_assert!(updated > 0); + + Ok(()) + }, + ) + .await + } + + /// Used when the remote part of a shard split failed: we will revert the database state to have only + /// the parent shards, with SplitState::Idle. + pub(crate) async fn abort_shard_split( + &self, + split_tenant_id: TenantId, + new_shard_count: ShardCount, + ) -> DatabaseResult { + use crate::schema::tenant_shards::dsl::*; + self.with_measured_conn( + DatabaseOperation::AbortShardSplit, + move |conn| -> DatabaseResult { + // Clear the splitting state on parent shards + let updated = diesel::update(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.ne(new_shard_count.literal() as i32)) + .set((splitting.eq(0),)) + .execute(conn)?; + + // Parent shards are already gone: we cannot abort. + if updated == 0 { + return Ok(AbortShardSplitStatus::Complete); + } + + // Sanity check: if parent shards were present, their cardinality should + // be less than the number of child shards. 
+ if updated >= new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + "Unexpected parent shard count {updated} while aborting split to \ + count {new_shard_count:?} on tenant {split_tenant_id}" + ))); + } + + // Erase child shards + diesel::delete(tenant_shards) + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)) + .execute(conn)?; + + Ok(AbortShardSplitStatus::Aborted) + }, + ) + .await + } +} + +/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably +#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[diesel(table_name = crate::schema::tenant_shards)] +pub(crate) struct TenantShardPersistence { + #[serde(default)] + pub(crate) tenant_id: String, + #[serde(default)] + pub(crate) shard_number: i32, + #[serde(default)] + pub(crate) shard_count: i32, + #[serde(default)] + pub(crate) shard_stripe_size: i32, + + // Latest generation number: next time we attach, increment this + // and use the incremented number when attaching. + // + // Generation is only None when first onboarding a tenant, where it may + // be in PlacementPolicy::Secondary and therefore have no valid generation state. + pub(crate) generation: Option, + + // Currently attached pageserver + #[serde(rename = "pageserver")] + pub(crate) generation_pageserver: Option, + + #[serde(default)] + pub(crate) placement_policy: String, + #[serde(default)] + pub(crate) splitting: SplitState, + #[serde(default)] + pub(crate) config: String, + #[serde(default)] + pub(crate) scheduling_policy: String, +} + +impl TenantShardPersistence { + pub(crate) fn get_shard_identity(&self) -> Result { + if self.shard_count == 0 { + Ok(ShardIdentity::unsharded()) + } else { + Ok(ShardIdentity::new( + ShardNumber(self.shard_number as u8), + ShardCount::new(self.shard_count as u8), + ShardStripeSize(self.shard_stripe_size as u32), + )?) 
+ } + } + + pub(crate) fn get_tenant_shard_id(&self) -> Result { + Ok(TenantShardId { + tenant_id: TenantId::from_str(self.tenant_id.as_str())?, + shard_number: ShardNumber(self.shard_number as u8), + shard_count: ShardCount::new(self.shard_count as u8), + }) + } +} + +/// Parts of [`crate::node::Node`] that are stored durably +#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)] +#[diesel(table_name = crate::schema::nodes)] +pub(crate) struct NodePersistence { + pub(crate) node_id: i64, + pub(crate) scheduling_policy: String, + pub(crate) listen_http_addr: String, + pub(crate) listen_http_port: i32, + pub(crate) listen_pg_addr: String, + pub(crate) listen_pg_port: i32, +} diff --git a/storage_controller/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs new file mode 100644 index 0000000000..bce1a75843 --- /dev/null +++ b/storage_controller/src/persistence/split_state.rs @@ -0,0 +1,46 @@ +use diesel::pg::{Pg, PgValue}; +use diesel::{ + deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql, + sql_types::Int2, +}; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)] +#[diesel(sql_type = SplitStateSQLRepr)] +#[derive(Deserialize, Serialize)] +pub enum SplitState { + Idle = 0, + Splitting = 1, +} + +impl Default for SplitState { + fn default() -> Self { + Self::Idle + } +} + +type SplitStateSQLRepr = Int2; + +impl ToSql for SplitState { + fn to_sql<'a>( + &'a self, + out: &'a mut diesel::serialize::Output, + ) -> diesel::serialize::Result { + let raw_value: i16 = *self as i16; + let mut new_out = out.reborrow(); + ToSql::::to_sql(&raw_value, &mut new_out) + } +} + +impl FromSql for SplitState { + fn from_sql(pg_value: PgValue) -> diesel::deserialize::Result { + match FromSql::::from_sql(pg_value).map(|v| match v { + 0 => Some(Self::Idle), + 1 => Some(Self::Splitting), + _ => None, + })? 
{ + Some(v) => Ok(v), + None => Err(format!("Invalid SplitState value, was: {:?}", pg_value.as_bytes()).into()), + } + } +} diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs new file mode 100644 index 0000000000..fe97f724c1 --- /dev/null +++ b/storage_controller/src/reconciler.rs @@ -0,0 +1,830 @@ +use crate::pageserver_client::PageserverClient; +use crate::persistence::Persistence; +use crate::service; +use pageserver_api::models::{ + LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, +}; +use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_client::mgmt_api; +use reqwest::StatusCode; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, Instant}; +use tokio_util::sync::CancellationToken; +use utils::generation::Generation; +use utils::id::{NodeId, TimelineId}; +use utils::lsn::Lsn; +use utils::sync::gate::GateGuard; + +use crate::compute_hook::{ComputeHook, NotifyError}; +use crate::node::Node; +use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation}; + +const DEFAULT_HEATMAP_PERIOD: &str = "60s"; + +/// Object with the lifetime of the background reconcile task that is created +/// for tenants which have a difference between their intent and observed states. +pub(super) struct Reconciler { + /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot + /// of a tenant's state from when we spawned a reconcile task. + pub(super) tenant_shard_id: TenantShardId, + pub(crate) shard: ShardIdentity, + pub(crate) generation: Option, + pub(crate) intent: TargetState, + + /// Nodes not referenced by [`Self::intent`], from which we should try + /// to detach this tenant shard. 
+ pub(crate) detach: Vec, + + pub(crate) config: TenantConfig, + pub(crate) observed: ObservedState, + + pub(crate) service_config: service::Config, + + /// A hook to notify the running postgres instances when we change the location + /// of a tenant. Use this via [`Self::compute_notify`] to update our failure flag + /// and guarantee eventual retries. + pub(crate) compute_hook: Arc, + + /// To avoid stalling if the cloud control plane is unavailable, we may proceed + /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed + /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. + pub(crate) compute_notify_failure: bool, + + /// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many + /// we will spawn. + pub(crate) _resource_units: ReconcileUnits, + + /// A means to abort background reconciliation: it is essential to + /// call this when something changes in the original TenantShard that + /// will make this reconciliation impossible or unnecessary, for + /// example when a pageserver node goes offline, or the PlacementPolicy for + /// the tenant is changed. + pub(crate) cancel: CancellationToken, + + /// Reconcilers are registered with a Gate so that during a graceful shutdown we + /// can wait for all the reconcilers to respond to their cancellation tokens. 
+ pub(crate) _gate_guard: GateGuard, + + /// Access to persistent storage for updating generation numbers + pub(crate) persistence: Arc, +} + +/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O +pub(crate) struct ReconcileUnits { + _sem_units: tokio::sync::OwnedSemaphorePermit, +} + +impl ReconcileUnits { + pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self { + Self { + _sem_units: sem_units, + } + } +} + +/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any +/// reference counting for Scheduler. The IntentState is what the scheduler works with, +/// and the TargetState is just the instruction for a particular Reconciler run. +#[derive(Debug)] +pub(crate) struct TargetState { + pub(crate) attached: Option, + pub(crate) secondary: Vec, +} + +impl TargetState { + pub(crate) fn from_intent(nodes: &HashMap, intent: &IntentState) -> Self { + Self { + attached: intent.get_attached().map(|n| { + nodes + .get(&n) + .expect("Intent attached referenced non-existent node") + .clone() + }), + secondary: intent + .get_secondary() + .iter() + .map(|n| { + nodes + .get(n) + .expect("Intent secondary referenced non-existent node") + .clone() + }) + .collect(), + } + } +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum ReconcileError { + #[error(transparent)] + Remote(#[from] mgmt_api::Error), + #[error(transparent)] + Notify(#[from] NotifyError), + #[error("Cancelled")] + Cancel, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl Reconciler { + async fn location_config( + &mut self, + node: &Node, + config: LocationConfig, + flush_ms: Option, + lazy: bool, + ) -> Result<(), ReconcileError> { + if !node.is_available() && config.mode == LocationConfigMode::Detached { + // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline + // will get fully reconciled wrt the shard's intent state when it is reactivated, 
irrespective of + // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`] + tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation"); + self.observed.locations.remove(&node.get_id()); + return Ok(()); + } + + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: None }); + + // TODO: amend locations that use long-polling: they will hit this timeout. + let timeout = Duration::from_secs(25); + + tracing::info!("location_config({node}) calling: {:?}", config); + let tenant_shard_id = self.tenant_shard_id; + let config_ref = &config; + match node + .with_client_retries( + |client| async move { + let config = config_ref.clone(); + client + .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) + .await + }, + &self.service_config.jwt_token, + 1, + 3, + timeout, + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("location_config({node}) complete: {:?}", config); + + match config.mode { + LocationConfigMode::Detached => { + self.observed.locations.remove(&node.get_id()); + } + _ => { + self.observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: Some(config) }); + } + } + + Ok(()) + } + + fn get_node(&self, node_id: &NodeId) -> Option<&Node> { + if let Some(node) = self.intent.attached.as_ref() { + if node.get_id() == *node_id { + return Some(node); + } + } + + if let Some(node) = self + .intent + .secondary + .iter() + .find(|n| n.get_id() == *node_id) + { + return Some(node); + } + + if let Some(node) = self.detach.iter().find(|n| n.get_id() == *node_id) { + return Some(node); + } + + None + } + + async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> { + let destination = if let Some(node) = &self.intent.attached { + match self.observed.locations.get(&node.get_id()) { + Some(conf) => { + // We 
will do a live migration only if the intended destination is not + // currently in an attached state. + match &conf.conf { + Some(conf) if conf.mode == LocationConfigMode::Secondary => { + // Fall through to do a live migration + node + } + None | Some(_) => { + // Attached or uncertain: don't do a live migration, proceed + // with a general-case reconciliation + tracing::info!("maybe_live_migrate: destination is None or attached"); + return Ok(()); + } + } + } + None => { + // Our destination is not attached: maybe live migrate if some other + // node is currently attached. Fall through. + node + } + } + } else { + // No intent to be attached + tracing::info!("maybe_live_migrate: no attached intent"); + return Ok(()); + }; + + let mut origin = None; + for (node_id, state) in &self.observed.locations { + if let Some(observed_conf) = &state.conf { + if observed_conf.mode == LocationConfigMode::AttachedSingle { + // We will only attempt live migration if the origin is not offline: this + // avoids trying to do it while reconciling after responding to an HA failover. 
+ if let Some(node) = self.get_node(node_id) { + if node.is_available() { + origin = Some(node.clone()); + break; + } + } + } + } + } + + let Some(origin) = origin else { + tracing::info!("maybe_live_migrate: no origin found"); + return Ok(()); + }; + + // We have an origin and a destination: proceed to do the live migration + tracing::info!("Live migrating {}->{}", origin, destination); + self.live_migrate(origin, destination.clone()).await?; + + Ok(()) + } + + async fn get_lsns( + &self, + tenant_shard_id: TenantShardId, + node: &Node, + ) -> anyhow::Result> { + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.service_config.jwt_token.as_deref(), + ); + + let timelines = client.timeline_list(&tenant_shard_id).await?; + Ok(timelines + .into_iter() + .map(|t| (t.timeline_id, t.last_record_lsn)) + .collect()) + } + + async fn secondary_download( + &self, + tenant_shard_id: TenantShardId, + node: &Node, + ) -> Result<(), ReconcileError> { + // This is not the timeout for a request, but the total amount of time we're willing to wait + // for a secondary location to get up to date before + const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300); + + // This the long-polling interval for the secondary download requests we send to destination pageserver + // during a migration. 
+ const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20); + + let started_at = Instant::now(); + + loop { + let (status, progress) = match node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download( + tenant_shard_id, + Some(REQUEST_DOWNLOAD_TIMEOUT), + ) + .await + }, + &self.service_config.jwt_token, + 1, + 3, + REQUEST_DOWNLOAD_TIMEOUT * 2, + &self.cancel, + ) + .await + { + None => Err(ReconcileError::Cancel), + Some(Ok(v)) => Ok(v), + Some(Err(e)) => { + // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before + // attaching, but we should not let an issue with a secondary location stop us proceeding + // with a live migration. + tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})"); + return Ok(()); + } + }?; + + if status == StatusCode::OK { + tracing::info!( + "Downloads to {} complete: {}/{} layers, {}/{} bytes", + node, + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + return Ok(()); + } else if status == StatusCode::ACCEPTED { + let total_runtime = started_at.elapsed(); + if total_runtime > TOTAL_DOWNLOAD_TIMEOUT { + tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes", + total_runtime.as_millis(), + progress.layers_downloaded, + progress.layers_total, + progress.bytes_downloaded, + progress.bytes_total + ); + // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working, + // it just makes the I/O performance for users less good. + return Ok(()); + } + + // Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call + // to the pageserver is a long-poll. 
+                tracing::info!(
+                    "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
+                    node,
+                    progress.layers_downloaded,
+                    progress.layers_total,
+                    progress.bytes_downloaded,
+                    progress.bytes_total
+                );
+            }
+        }
+    }
+
+    /// Poll `node` until every timeline in `baseline` reports an LSN that is at least the
+    /// baseline LSN recorded for it.
+    ///
+    /// Polls every 500ms.  Failures to list timelines are logged and retried, so this only
+    /// returns (with `Ok`) once the destination has caught up.
+    async fn await_lsn(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node: &Node,
+        baseline: HashMap<TimelineId, Lsn>,
+    ) -> anyhow::Result<()> {
+        const POLL_INTERVAL: Duration = Duration::from_millis(500);
+
+        loop {
+            let latest = match self.get_lsns(tenant_shard_id, node).await {
+                Ok(l) => l,
+                Err(e) => {
+                    tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",);
+                    // Yield to the runtime while waiting: a blocking `std::thread::sleep` here
+                    // would stall the executor thread this future is running on.
+                    tokio::time::sleep(POLL_INTERVAL).await;
+                    continue;
+                }
+            };
+
+            let mut any_behind: bool = false;
+            for (timeline_id, baseline_lsn) in &baseline {
+                match latest.get(timeline_id) {
+                    Some(latest_lsn) => {
+                        tracing::info!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
+                        if latest_lsn < baseline_lsn {
+                            any_behind = true;
+                        }
+                    }
+                    None => {
+                        // Expected timeline isn't yet visible on migration destination.
+                        // (IRL we would have to account for timeline deletion, but this
+                        //  is just test helper)
+                        any_behind = true;
+                    }
+                }
+            }
+
+            if !any_behind {
+                tracing::info!("✅ LSN caught up. Proceeding...");
+                break;
+            } else {
+                tokio::time::sleep(POLL_INTERVAL).await;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Move the attached location of this shard from `origin_ps` to `dest_ps`.
+    ///
+    /// Steps, in order: put the origin into `AttachedStale`, warm the destination if it is an
+    /// observed secondary, increment the generation, attach the destination as `AttachedMulti`,
+    /// wait for its LSNs to catch up, notify compute, downgrade the origin to a secondary and
+    /// finally switch the destination to `AttachedSingle`.
+    ///
+    /// Panics if `origin_ps` and `dest_ps` are the same node.
+    pub async fn live_migrate(
+        &mut self,
+        origin_ps: Node,
+        dest_ps: Node,
+    ) -> Result<(), ReconcileError> {
+        // `maybe_live_migrate` is responsible for sanity of inputs
+        assert!(origin_ps.get_id() != dest_ps.get_id());
+
+        fn build_location_config(
+            shard: &ShardIdentity,
+            config: &TenantConfig,
+            mode: LocationConfigMode,
+            generation: Option<Generation>,
+            secondary_conf: Option<LocationConfigSecondary>,
+        ) -> LocationConfig {
+            LocationConfig {
+                mode,
+                generation: generation.map(|g| g.into().unwrap()),
+                secondary_conf,
+                tenant_conf: config.clone(),
+                shard_number: shard.number.0,
+                shard_count: shard.count.literal(),
+                shard_stripe_size: shard.stripe_size.0,
+            }
+        }
+
+        tracing::info!("🔁 Switching origin node {origin_ps} to stale mode",);
+
+        // FIXME: it is incorrect to use self.generation here, we should use the generation
+        // from the ObservedState of the origin pageserver (it might be older than self.generation)
+        let stale_conf = build_location_config(
+            &self.shard,
+            &self.config,
+            LocationConfigMode::AttachedStale,
+            self.generation,
+            None,
+        );
+        self.location_config(&origin_ps, stale_conf, Some(Duration::from_secs(10)), false)
+            .await?;
+
+        let baseline_lsns = Some(self.get_lsns(self.tenant_shard_id, &origin_ps).await?);
+
+        // If we are migrating to a destination that has a secondary location, warm it up first
+        if let Some(destination_conf) = self.observed.locations.get(&dest_ps.get_id()) {
+            if let Some(destination_conf) = &destination_conf.conf {
+                if destination_conf.mode == LocationConfigMode::Secondary {
+                    tracing::info!("🔁 Downloading latest layers to destination node {dest_ps}",);
+                    self.secondary_download(self.tenant_shard_id, &dest_ps)
+                        .await?;
+                }
+            }
+        }
+
+        // Increment generation before attaching to new pageserver
+        self.generation = Some(
+            self.persistence
+                .increment_generation(self.tenant_shard_id, dest_ps.get_id())
+                .await?,
+        );
+
+        let dest_conf = build_location_config(
+            &self.shard,
+            &self.config,
+            LocationConfigMode::AttachedMulti,
+            self.generation,
+            None,
+        );
+
+        tracing::info!("🔁 Attaching to pageserver {dest_ps}");
+        self.location_config(&dest_ps, dest_conf, None, false)
+            .await?;
+
+        if let Some(baseline) = baseline_lsns {
+            tracing::info!("🕑 Waiting for LSN to catch up...");
+            self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
+                .await?;
+        }
+
+        tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}");
+
+        // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach
+        // the origin without notifying compute, we will render the tenant unavailable.
+        while let Err(e) = self.compute_notify().await {
+            match e {
+                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
+                NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
+                _ => {
+                    tracing::warn!(
+                        "Live migration blocked by compute notification error, retrying: {e}"
+                    );
+                }
+            }
+        }
+
+        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
+        // this location will be deleted in the general case reconciliation that runs after this.
+        let origin_secondary_conf = build_location_config(
+            &self.shard,
+            &self.config,
+            LocationConfigMode::Secondary,
+            None,
+            Some(LocationConfigSecondary { warm: true }),
+        );
+        self.location_config(&origin_ps, origin_secondary_conf.clone(), None, false)
+            .await?;
+        // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
+        // partway through.  In fact, all location conf API calls should be in a wrapper that sets
+        // the observed state to None, then runs, then sets it to what we wrote.
+        self.observed.locations.insert(
+            origin_ps.get_id(),
+            ObservedStateLocation {
+                conf: Some(origin_secondary_conf),
+            },
+        );
+
+        tracing::info!("🔁 Switching to AttachedSingle mode on node {dest_ps}",);
+        let dest_final_conf = build_location_config(
+            &self.shard,
+            &self.config,
+            LocationConfigMode::AttachedSingle,
+            self.generation,
+            None,
+        );
+        self.location_config(&dest_ps, dest_final_conf.clone(), None, false)
+            .await?;
+        self.observed.locations.insert(
+            dest_ps.get_id(),
+            ObservedStateLocation {
+                conf: Some(dest_final_conf),
+            },
+        );
+
+        tracing::info!("✅ Migration complete");
+
+        Ok(())
+    }
+
+    async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> {
+        // If the attached node has uncertain state, read it from the pageserver before proceeding: this
+        // is important to avoid spurious generation increments.
+        //
+        // We don't need to do this for secondary/detach locations because it's harmless to just PUT their
+        // location conf, whereas for attached locations it can interrupt clients if we spuriously destroy/recreate
+        // the `Timeline` object in the pageserver.
+ + let Some(attached_node) = self.intent.attached.as_ref() else { + // Nothing to do + return Ok(()); + }; + + if matches!( + self.observed.locations.get(&attached_node.get_id()), + Some(ObservedStateLocation { conf: None }) + ) { + let tenant_shard_id = self.tenant_shard_id; + let observed_conf = match attached_node + .with_client_retries( + |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 1, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(observed)) => Some(observed), + Some(Err(mgmt_api::Error::ApiError(status, _msg))) + if status == StatusCode::NOT_FOUND => + { + None + } + Some(Err(e)) => return Err(e.into()), + None => return Err(ReconcileError::Cancel), + }; + tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}"); + match observed_conf { + Some(conf) => { + // Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state, + // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running) + self.observed + .locations + .insert(attached_node.get_id(), ObservedStateLocation { conf }); + } + None => { + // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver. + self.observed.locations.remove(&attached_node.get_id()); + } + } + } + + Ok(()) + } + + /// Reconciling a tenant makes API calls to pageservers until the observed state + /// matches the intended state. + /// + /// First we apply special case handling (e.g. for live migrations), and then a + /// general case reconciliation where we walk through the intent by pageserver + /// and call out to the pageserver to apply the desired state. 
+ pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> { + // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it + self.maybe_refresh_observed().await?; + + // Special case: live migration + self.maybe_live_migrate().await?; + + // If the attached pageserver is not attached, do so now. + if let Some(node) = self.intent.attached.as_ref() { + // If we are in an attached policy, then generation must have been set (null generations + // are only present when a tenant is initially loaded with a secondary policy) + debug_assert!(self.generation.is_some()); + let Some(generation) = self.generation else { + return Err(ReconcileError::Other(anyhow::anyhow!( + "Attempted to attach with NULL generation" + ))); + }; + + let mut wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); + match self.observed.locations.get(&node.get_id()) { + Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { + // Nothing to do + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") + } + observed => { + // In all cases other than a matching observed configuration, we will + // reconcile this location. This includes locations with different configurations, as well + // as locations with unknown (None) observed state. + + // The general case is to increment the generation. However, there are cases + // where this is not necessary: + // - if we are only updating the TenantConf part of the location + // - if we are only changing the attachment mode (e.g. 
going to attachedmulti or attachedstale) + // and the location was already in the correct generation + let increment_generation = match observed { + None => true, + Some(ObservedStateLocation { conf: None }) => true, + Some(ObservedStateLocation { + conf: Some(observed), + }) => { + let generations_match = observed.generation == wanted_conf.generation; + + use LocationConfigMode::*; + let mode_transition_requires_gen_inc = + match (observed.mode, wanted_conf.mode) { + // Usually the short-lived attachment modes (multi and stale) are only used + // in the case of [`Self::live_migrate`], but it is simple to handle them correctly + // here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation. + (AttachedSingle, AttachedStale) => false, + (AttachedMulti, AttachedSingle) => false, + (lhs, rhs) => lhs != rhs, + }; + + !generations_match || mode_transition_requires_gen_inc + } + }; + + if increment_generation { + let generation = self + .persistence + .increment_generation(self.tenant_shard_id, node.get_id()) + .await?; + self.generation = Some(generation); + wanted_conf.generation = generation.into(); + } + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + + // Because `node` comes from a ref to &self, clone it before calling into a &mut self + // function: this could be avoided by refactoring the state mutated by location_config into + // a separate type to Self. + let node = node.clone(); + + // Use lazy=true, because we may run many of Self concurrently, and do not want to + // overload the pageserver with logical size calculations. + self.location_config(&node, wanted_conf, None, true).await?; + self.compute_notify().await?; + } + } + } + + // Configure secondary locations: if these were previously attached this + // implicitly downgrades them from attached to secondary. 
+ let mut changes = Vec::new(); + for node in &self.intent.secondary { + let wanted_conf = secondary_location_conf(&self.shard, &self.config); + match self.observed.locations.get(&node.get_id()) { + Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { + // Nothing to do + tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.") + } + _ => { + // In all cases other than a matching observed configuration, we will + // reconcile this location. + tracing::info!(node_id=%node.get_id(), "Observed configuration requires update."); + changes.push((node.clone(), wanted_conf)) + } + } + } + + // Detach any extraneous pageservers that are no longer referenced + // by our intent. + for node in &self.detach { + changes.push(( + node.clone(), + LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: self.shard.number.0, + shard_count: self.shard.count.literal(), + shard_stripe_size: self.shard.stripe_size.0, + tenant_conf: self.config.clone(), + }, + )); + } + + for (node, conf) in changes { + if self.cancel.is_cancelled() { + return Err(ReconcileError::Cancel); + } + self.location_config(&node, conf, None, false).await?; + } + + Ok(()) + } + + pub(crate) async fn compute_notify(&mut self) -> Result<(), NotifyError> { + // Whenever a particular Reconciler emits a notification, it is always notifying for the intended + // destination. + if let Some(node) = &self.intent.attached { + let result = self + .compute_hook + .notify( + self.tenant_shard_id, + node.get_id(), + self.shard.stripe_size, + &self.cancel, + ) + .await; + if let Err(e) = &result { + // It is up to the caller whether they want to drop out on this error, but they don't have to: + // in general we should avoid letting unavailability of the cloud control plane stop us from + // making progress. 
+ if !matches!(e, NotifyError::ShuttingDown) { + tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}"); + } + + // Set this flag so that in our ReconcileResult we will set the flag on the shard that it + // needs to retry at some point. + self.compute_notify_failure = true; + } + result + } else { + Ok(()) + } + } +} + +/// We tweak the externally-set TenantConfig while configuring +/// locations, using our awareness of whether secondary locations +/// are in use to automatically enable/disable heatmap uploads. +fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig { + let mut config = config.clone(); + if has_secondaries { + if config.heatmap_period.is_none() { + config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string()); + } + } else { + config.heatmap_period = None; + } + config +} + +pub(crate) fn attached_location_conf( + generation: Generation, + shard: &ShardIdentity, + config: &TenantConfig, + has_secondaries: bool, +) -> LocationConfig { + LocationConfig { + mode: LocationConfigMode::AttachedSingle, + generation: generation.into(), + secondary_conf: None, + shard_number: shard.number.0, + shard_count: shard.count.literal(), + shard_stripe_size: shard.stripe_size.0, + tenant_conf: ha_aware_config(config, has_secondaries), + } +} + +pub(crate) fn secondary_location_conf( + shard: &ShardIdentity, + config: &TenantConfig, +) -> LocationConfig { + LocationConfig { + mode: LocationConfigMode::Secondary, + generation: None, + secondary_conf: Some(LocationConfigSecondary { warm: true }), + shard_number: shard.number.0, + shard_count: shard.count.literal(), + shard_stripe_size: shard.stripe_size.0, + tenant_conf: ha_aware_config(config, true), + } +} diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs new file mode 100644 index 0000000000..0bd2eeac35 --- /dev/null +++ b/storage_controller/src/scheduler.rs @@ -0,0 +1,556 @@ +use crate::{node::Node, tenant_shard::TenantShard}; 
+use itertools::Itertools; +use pageserver_api::controller_api::UtilizationScore; +use serde::Serialize; +use std::collections::HashMap; +use utils::{http::error::ApiError, id::NodeId}; + +/// Scenarios in which we cannot find a suitable location for a tenant shard +#[derive(thiserror::Error, Debug)] +pub enum ScheduleError { + #[error("No pageservers found")] + NoPageservers, + #[error("No pageserver found matching constraint")] + ImpossibleConstraint, +} + +impl From for ApiError { + fn from(value: ScheduleError) -> Self { + ApiError::Conflict(format!("Scheduling error: {}", value)) + } +} + +#[derive(Serialize, Eq, PartialEq)] +pub enum MaySchedule { + Yes(UtilizationScore), + No, +} + +#[derive(Serialize)] +struct SchedulerNode { + /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. + shard_count: usize, + /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. + attached_shard_count: usize, + + /// Whether this node is currently elegible to have new shards scheduled (this is derived + /// from a node's availability state and scheduling policy). + may_schedule: MaySchedule, +} + +impl PartialEq for SchedulerNode { + fn eq(&self, other: &Self) -> bool { + let may_schedule_matches = matches!( + (&self.may_schedule, &other.may_schedule), + (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No) + ); + + may_schedule_matches + && self.shard_count == other.shard_count + && self.attached_shard_count == other.attached_shard_count + } +} + +impl Eq for SchedulerNode {} + +/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver +/// on which to run. +/// +/// The type has no persistent state of its own: this is all populated at startup. The Serialize +/// impl is only for debug dumps. 
+#[derive(Serialize)] +pub(crate) struct Scheduler { + nodes: HashMap, +} + +/// Score for soft constraint scheduling: lower scores are preferred to higher scores. +/// +/// For example, we may set an affinity score based on the number of shards from the same +/// tenant already on a node, to implicitly prefer to balance out shards. +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] +pub(crate) struct AffinityScore(pub(crate) usize); + +impl AffinityScore { + /// If we have no anti-affinity at all toward a node, this is its score. It means + /// the scheduler has a free choice amongst nodes with this score, and may pick a node + /// based on other information such as total utilization. + pub(crate) const FREE: Self = Self(0); + + pub(crate) fn inc(&mut self) { + self.0 += 1; + } +} + +impl std::ops::Add for AffinityScore { + type Output = Self; + + fn add(self, rhs: Self) -> Self::Output { + Self(self.0 + rhs.0) + } +} + +/// Hint for whether this is a sincere attempt to schedule, or a speculative +/// check for where we _would_ schedule (done during optimization) +#[derive(Debug)] +pub(crate) enum ScheduleMode { + Normal, + Speculative, +} + +impl Default for ScheduleMode { + fn default() -> Self { + Self::Normal + } +} + +// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling +// it for many shards in the same tenant. +#[derive(Debug, Default)] +pub(crate) struct ScheduleContext { + /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] + pub(crate) nodes: HashMap, + + /// Specifically how many _attached_ locations are on each node + pub(crate) attached_nodes: HashMap, + + pub(crate) mode: ScheduleMode, +} + +impl ScheduleContext { + /// Input is a list of nodes we would like to avoid using again within this context. The more + /// times a node is passed into this call, the less inclined we are to use it. 
+ pub(crate) fn avoid(&mut self, nodes: &[NodeId]) { + for node_id in nodes { + let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE); + entry.inc() + } + } + + pub(crate) fn push_attached(&mut self, node_id: NodeId) { + let entry = self.attached_nodes.entry(node_id).or_default(); + *entry += 1; + } + + pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore { + self.nodes + .get(&node_id) + .copied() + .unwrap_or(AffinityScore::FREE) + } + + pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { + self.attached_nodes.get(&node_id).copied().unwrap_or(0) + } +} + +pub(crate) enum RefCountUpdate { + PromoteSecondary, + Attach, + Detach, + DemoteAttached, + AddSecondary, + RemoveSecondary, +} + +impl Scheduler { + pub(crate) fn new<'a>(nodes: impl Iterator) -> Self { + let mut scheduler_nodes = HashMap::new(); + for node in nodes { + scheduler_nodes.insert( + node.get_id(), + SchedulerNode { + shard_count: 0, + attached_shard_count: 0, + may_schedule: node.may_schedule(), + }, + ); + } + + Self { + nodes: scheduler_nodes, + } + } + + /// For debug/support: check that our internal statistics are in sync with the state of + /// the nodes & tenant shards. + /// + /// If anything is inconsistent, log details and return an error. 
+ pub(crate) fn consistency_check<'a>( + &self, + nodes: impl Iterator, + shards: impl Iterator, + ) -> anyhow::Result<()> { + let mut expect_nodes: HashMap = HashMap::new(); + for node in nodes { + expect_nodes.insert( + node.get_id(), + SchedulerNode { + shard_count: 0, + attached_shard_count: 0, + may_schedule: node.may_schedule(), + }, + ); + } + + for shard in shards { + if let Some(node_id) = shard.intent.get_attached() { + match expect_nodes.get_mut(node_id) { + Some(node) => { + node.shard_count += 1; + node.attached_shard_count += 1; + } + None => anyhow::bail!( + "Tenant {} references nonexistent node {}", + shard.tenant_shard_id, + node_id + ), + } + } + + for node_id in shard.intent.get_secondary() { + match expect_nodes.get_mut(node_id) { + Some(node) => node.shard_count += 1, + None => anyhow::bail!( + "Tenant {} references nonexistent node {}", + shard.tenant_shard_id, + node_id + ), + } + } + } + + for (node_id, expect_node) in &expect_nodes { + let Some(self_node) = self.nodes.get(node_id) else { + anyhow::bail!("Node {node_id} not found in Self") + }; + + if self_node != expect_node { + tracing::error!("Inconsistency detected in scheduling state for node {node_id}"); + tracing::error!("Expected state: {}", serde_json::to_string(expect_node)?); + tracing::error!("Self state: {}", serde_json::to_string(self_node)?); + + anyhow::bail!("Inconsistent state on {node_id}"); + } + } + + if expect_nodes.len() != self.nodes.len() { + // We just checked that all the expected nodes are present. If the lengths don't match, + // it means that we have nodes in Self that are unexpected. + for node_id in self.nodes.keys() { + if !expect_nodes.contains_key(node_id) { + anyhow::bail!("Node {node_id} found in Self but not in expected nodes"); + } + } + } + + Ok(()) + } + + /// Update the reference counts of a node. 
These reference counts are used to guide scheduling + /// decisions, not for memory management: they represent the number of tenant shard whose IntentState + /// targets this node and the number of tenants shars whose IntentState is attached to this + /// node. + /// + /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into + /// [`Self::new`] or [`Self::node_upsert`]) + pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) { + let Some(node) = self.nodes.get_mut(&node_id) else { + debug_assert!(false); + tracing::error!("Scheduler missing node {node_id}"); + return; + }; + + match update { + RefCountUpdate::PromoteSecondary => { + node.attached_shard_count += 1; + } + RefCountUpdate::Attach => { + node.shard_count += 1; + node.attached_shard_count += 1; + } + RefCountUpdate::Detach => { + node.shard_count -= 1; + node.attached_shard_count -= 1; + } + RefCountUpdate::DemoteAttached => { + node.attached_shard_count -= 1; + } + RefCountUpdate::AddSecondary => { + node.shard_count += 1; + } + RefCountUpdate::RemoveSecondary => { + node.shard_count -= 1; + } + } + } + + // Check if the number of shards attached to a given node is lagging below + // the cluster average. If that's the case, the node should be filled. 
+ pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize { + let Some(node) = self.nodes.get(&node_id) else { + debug_assert!(false); + tracing::error!("Scheduler missing node {node_id}"); + return 0; + }; + assert!(!self.nodes.is_empty()); + let expected_attached_shards_per_node = self.expected_attached_shard_count(); + + for (node_id, node) in self.nodes.iter() { + tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node); + } + + if node.attached_shard_count < expected_attached_shards_per_node { + expected_attached_shards_per_node - node.attached_shard_count + } else { + 0 + } + } + + pub(crate) fn expected_attached_shard_count(&self) -> usize { + let total_attached_shards: usize = + self.nodes.values().map(|n| n.attached_shard_count).sum(); + + assert!(!self.nodes.is_empty()); + total_attached_shards / self.nodes.len() + } + + pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> { + self.nodes + .iter() + .map(|(node_id, stats)| (*node_id, stats.attached_shard_count)) + .sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse()) + .collect() + } + + pub(crate) fn node_upsert(&mut self, node: &Node) { + use std::collections::hash_map::Entry::*; + match self.nodes.entry(node.get_id()) { + Occupied(mut entry) => { + entry.get_mut().may_schedule = node.may_schedule(); + } + Vacant(entry) => { + entry.insert(SchedulerNode { + shard_count: 0, + attached_shard_count: 0, + may_schedule: node.may_schedule(), + }); + } + } + } + + pub(crate) fn node_remove(&mut self, node_id: NodeId) { + if self.nodes.remove(&node_id).is_none() { + tracing::warn!(node_id=%node_id, "Removed non-existent node from scheduler"); + } + } + + /// Where we have several nodes to choose from, for example when picking a secondary location + /// to promote to an attached location, this method may be used to pick the best choice based + /// on the scheduler's 
knowledge of utilization and availability. + /// + /// If the input is empty, or all the nodes are not elegible for scheduling, return None: the + /// caller can pick a node some other way. + pub(crate) fn node_preferred(&self, nodes: &[NodeId]) -> Option { + if nodes.is_empty() { + return None; + } + + // TODO: When the utilization score returned by the pageserver becomes meaningful, + // schedule based on that instead of the shard count. + let node = nodes + .iter() + .map(|node_id| { + let may_schedule = self + .nodes + .get(node_id) + .map(|n| n.may_schedule != MaySchedule::No) + .unwrap_or(false); + (*node_id, may_schedule) + }) + .max_by_key(|(_n, may_schedule)| *may_schedule); + + // If even the preferred node has may_schedule==false, return None + node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None }) + } + + /// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they + /// are already in use by this shard -- we use this to avoid picking the same node + /// as both attached and secondary location. This is a hard constraint: if we cannot + /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`]. + /// + /// context: we prefer to avoid using nodes identified in the context, according + /// to their anti-affinity score. We use this to prefeer to avoid placing shards in + /// the same tenant on the same node. This is a soft constraint: the context will never + /// cause us to fail to schedule a shard. 
+ pub(crate) fn schedule_shard( + &self, + hard_exclude: &[NodeId], + context: &ScheduleContext, + ) -> Result { + if self.nodes.is_empty() { + return Err(ScheduleError::NoPageservers); + } + + let mut scores: Vec<(NodeId, AffinityScore, usize)> = self + .nodes + .iter() + .filter_map(|(k, v)| { + if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No { + None + } else { + Some(( + *k, + context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE), + v.shard_count, + )) + } + }) + .collect(); + + // Sort by, in order of precedence: + // 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available + // 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes. + // 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems. + scores.sort_by_key(|i| (i.1, i.2, i.0)); + + if scores.is_empty() { + // After applying constraints, no pageservers were left. + if !matches!(context.mode, ScheduleMode::Speculative) { + // If this was not a speculative attempt, log details to understand why we couldn't + // schedule: this may help an engineer understand if some nodes are marked offline + // in a way that's preventing progress. 
+ tracing::info!( + "Scheduling failure, while excluding {hard_exclude:?}, node states:" + ); + for (node_id, node) in &self.nodes { + tracing::info!( + "Node {node_id}: may_schedule={} shards={}", + node.may_schedule != MaySchedule::No, + node.shard_count + ); + } + } + return Err(ScheduleError::ImpossibleConstraint); + } + + // Lowest score wins + let node_id = scores.first().unwrap().0; + + if !matches!(context.mode, ScheduleMode::Speculative) { + tracing::info!( + "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})", + scores.iter().map(|i| i.0 .0).collect::>() + ); + } + + // Note that we do not update shard count here to reflect the scheduling: that + // is IntentState's job when the scheduled location is used. + + Ok(node_id) + } + + /// Unit test access to internal state + #[cfg(test)] + pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { + self.nodes.get(&node_id).unwrap().shard_count + } + + #[cfg(test)] + pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize { + self.nodes.get(&node_id).unwrap().attached_shard_count + } +} + +#[cfg(test)] +pub(crate) mod test_utils { + + use crate::node::Node; + use pageserver_api::controller_api::{NodeAvailability, UtilizationScore}; + use std::collections::HashMap; + use utils::id::NodeId; + /// Test helper: synthesize the requested number of nodes, all in active state. + /// + /// Node IDs start at one. 
+ pub(crate) fn make_test_nodes(n: u64) -> HashMap { + (1..n + 1) + .map(|i| { + (NodeId(i), { + let mut node = Node::new( + NodeId(i), + format!("httphost-{i}"), + 80 + i as u16, + format!("pghost-{i}"), + 5432 + i as u16, + ); + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + assert!(node.is_available()); + node + }) + }) + .collect() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::tenant_shard::IntentState; + #[test] + fn scheduler_basic() -> anyhow::Result<()> { + let nodes = test_utils::make_test_nodes(2); + + let mut scheduler = Scheduler::new(nodes.values()); + let mut t1_intent = IntentState::new(); + let mut t2_intent = IntentState::new(); + + let context = ScheduleContext::default(); + + let scheduled = scheduler.schedule_shard(&[], &context)?; + t1_intent.set_attached(&mut scheduler, Some(scheduled)); + let scheduled = scheduler.schedule_shard(&[], &context)?; + t2_intent.set_attached(&mut scheduler, Some(scheduled)); + + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + + let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?; + t1_intent.push_secondary(&mut scheduler, scheduled); + + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + + t1_intent.clear(&mut scheduler); + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1); + + let total_attached = scheduler.get_node_attached_shard_count(NodeId(1)) + + scheduler.get_node_attached_shard_count(NodeId(2)); + assert_eq!(total_attached, 1); + + if 
cfg!(debug_assertions) { + // Dropping an IntentState without clearing it causes a panic in debug mode, + // because we have failed to properly update scheduler shard counts. + let result = std::panic::catch_unwind(move || { + drop(t2_intent); + }); + assert!(result.is_err()); + } else { + t2_intent.clear(&mut scheduler); + + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0); + } + + Ok(()) + } +} diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs new file mode 100644 index 0000000000..ff37d0fe77 --- /dev/null +++ b/storage_controller/src/schema.rs @@ -0,0 +1,29 @@ +// @generated automatically by Diesel CLI. + +diesel::table! { + nodes (node_id) { + node_id -> Int8, + scheduling_policy -> Varchar, + listen_http_addr -> Varchar, + listen_http_port -> Int4, + listen_pg_addr -> Varchar, + listen_pg_port -> Int4, + } +} + +diesel::table! 
{ + tenant_shards (tenant_id, shard_number, shard_count) { + tenant_id -> Varchar, + shard_number -> Int4, + shard_count -> Int4, + shard_stripe_size -> Int4, + generation -> Nullable, + generation_pageserver -> Nullable, + placement_policy -> Varchar, + splitting -> Int2, + config -> Text, + scheduling_policy -> Varchar, + } +} + +diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,); diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs new file mode 100644 index 0000000000..8475bf46d2 --- /dev/null +++ b/storage_controller/src/service.rs @@ -0,0 +1,5565 @@ +use std::{ + borrow::Cow, + cmp::Ordering, + collections::{BTreeMap, HashMap, HashSet}, + path::PathBuf, + str::FromStr, + sync::Arc, + time::{Duration, Instant}, +}; + +use crate::{ + background_node_operations::{ + Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION, + }, + compute_hook::NotifyError, + id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard}, + persistence::{AbortShardSplitStatus, TenantFilter}, + reconciler::{ReconcileError, ReconcileUnits}, + scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, + tenant_shard::{ + MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, + ScheduleOptimizationAction, + }, +}; +use anyhow::Context; +use control_plane::storage_controller::{ + AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse, +}; +use diesel::result::DatabaseErrorKind; +use futures::{stream::FuturesUnordered, StreamExt}; +use itertools::Itertools; +use pageserver_api::{ + controller_api::{ + NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard, + TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, + TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, + UtilizationScore, + }, + 
models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, +}; +use reqwest::StatusCode; +use tracing::{instrument, Instrument}; + +use crate::pageserver_client::PageserverClient; +use pageserver_api::{ + models::{ + self, LocationConfig, LocationConfigListResponse, LocationConfigMode, + PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest, + TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, + TenantShardSplitRequest, TenantShardSplitResponse, TenantTimeTravelRequest, + TimelineCreateRequest, TimelineInfo, + }, + shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, + upcall_api::{ + ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, + ValidateResponse, ValidateResponseTenant, + }, +}; +use pageserver_client::mgmt_api; +use tokio::sync::mpsc::error::TrySendError; +use tokio_util::sync::CancellationToken; +use utils::{ + completion::Barrier, + failpoint_support, + generation::Generation, + http::error::ApiError, + id::{NodeId, TenantId, TimelineId}, + sync::gate::Gate, +}; + +use crate::{ + compute_hook::ComputeHook, + heartbeater::{Heartbeater, PageserverState}, + node::{AvailabilityTransition, Node}, + persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, + reconciler::attached_location_conf, + scheduler::Scheduler, + tenant_shard::{ + IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, + ReconcilerWaiter, TenantShard, + }, +}; + +// For operations that should be quick, like attaching a new tenant +const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); + +// For operations that might be slow, like migrating a tenant with +// some data in it. +pub const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + +// If we receive a call using Secondary mode initially, it will omit generation. 
We will initialize +// tenant shards into this generation, and as long as it remains in this generation, we will accept +// input generation from future requests as authoritative. +const INITIAL_GENERATION: Generation = Generation::new(0); + +/// How long [`Service::startup_reconcile`] is allowed to take before it should give +/// up on unresponsive pageservers and proceed. +pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + +/// How long a node may be unresponsive to heartbeats before we declare it offline. +/// This must be long enough to cover node restarts as well as normal operations: in future +/// it should be separated into distinct timeouts for startup vs. normal operation +/// (``) +pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); + +#[derive(Clone, strum_macros::Display)] +enum TenantOperations { + Create, + LocationConfig, + ConfigSet, + TimeTravelRemoteStorage, + Delete, + UpdatePolicy, + ShardSplit, + SecondaryDownload, + TimelineCreate, + TimelineDelete, +} + +#[derive(Clone, strum_macros::Display)] +enum NodeOperations { + Register, + Configure, +} + +pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; + +// Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately. +// This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly +// than they're being pushed onto the queue. +const MAX_DELAYED_RECONCILES: usize = 10000; + +// Top level state available to all HTTP handlers +struct ServiceState { + tenants: BTreeMap, + + nodes: Arc>, + + scheduler: Scheduler, + + /// Ongoing background operation on the cluster if any is running. + /// Note that only one such operation may run at any given time, + /// hence the type choice. 
+ ongoing_operation: Option, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, +} + +/// Transform an error from a pageserver into an error to return to callers of a storage +/// controller API. +fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { + match e { + mgmt_api::Error::ReceiveErrorBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable( + format!("{node} error receiving error body: {str}").into(), + ) + } + mgmt_api::Error::ReceiveBody(str) => { + // Presume errors receiving body are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error receiving body: {str}").into()) + } + mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => { + ApiError::NotFound(anyhow::anyhow!(format!("{node}: {msg}")).into()) + } + mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg) => { + ApiError::ResourceUnavailable(format!("{node}: {msg}").into()) + } + mgmt_api::Error::ApiError(status @ StatusCode::UNAUTHORIZED, msg) + | mgmt_api::Error::ApiError(status @ StatusCode::FORBIDDEN, msg) => { + // Auth errors talking to a pageserver are not auth errors for the caller: they are + // internal server errors, showing that something is wrong with the pageserver or + // storage controller's auth configuration. + ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) + } + mgmt_api::Error::ApiError(status, msg) => { + // Presume general case of pageserver API errors is that we tried to do something + // that can't be done right now. 
+ ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } + mgmt_api::Error::Cancelled => ApiError::ShuttingDown, + } +} + +impl ServiceState { + fn new( + nodes: HashMap, + tenants: BTreeMap, + scheduler: Scheduler, + delayed_reconcile_rx: tokio::sync::mpsc::Receiver, + ) -> Self { + Self { + tenants, + nodes: Arc::new(nodes), + scheduler, + ongoing_operation: None, + delayed_reconcile_rx, + } + } + + fn parts_mut( + &mut self, + ) -> ( + &mut Arc>, + &mut BTreeMap, + &mut Scheduler, + ) { + (&mut self.nodes, &mut self.tenants, &mut self.scheduler) + } +} + +#[derive(Clone)] +pub struct Config { + // All pageservers managed by one instance of this service must have + // the same public key. This JWT token will be used to authenticate + // this service to the pageservers it manages. + pub jwt_token: Option, + + // This JWT token will be used to authenticate this service to the control plane. + pub control_plane_jwt_token: Option, + + /// Where the compute hook should send notifications of pageserver attachment locations + /// (this URL points to the control plane in prod). If this is None, the compute hook will + /// assume it is running in a test environment and try to update neon_local. + pub compute_hook_url: Option, + + /// Grace period within which a pageserver does not respond to heartbeats, but is still + /// considered active. Once the grace period elapses, the next heartbeat failure will + /// mark the pagseserver offline. + pub max_unavailable_interval: Duration, + + /// How many Reconcilers may be spawned concurrently + pub reconciler_concurrency: usize, + + /// How large must a shard grow in bytes before we split it? + /// None disables auto-splitting. 
+ pub split_threshold: Option, + + // TODO: make this cfg(feature = "testing") + pub neon_local_repo_dir: Option, +} + +impl From for ApiError { + fn from(err: DatabaseError) -> ApiError { + match err { + DatabaseError::Query(e) => ApiError::InternalServerError(e.into()), + // FIXME: ApiError doesn't have an Unavailable variant, but ShuttingDown maps to 503. + DatabaseError::Connection(_) | DatabaseError::ConnectionPool(_) => { + ApiError::ShuttingDown + } + DatabaseError::Logical(reason) => { + ApiError::InternalServerError(anyhow::anyhow!(reason)) + } + } + } +} + +pub struct Service { + inner: Arc>, + config: Config, + persistence: Arc, + compute_hook: Arc, + result_tx: tokio::sync::mpsc::UnboundedSender, + + heartbeater: Heartbeater, + + // Channel for background cleanup from failed operations that require cleanup, such as shard split + abort_tx: tokio::sync::mpsc::UnboundedSender, + + // Locking on a tenant granularity (covers all shards in the tenant): + // - Take exclusively for rare operations that mutate the tenant's persistent state (e.g. create/delete/split) + // - Take in shared mode for operations that need the set of shards to stay the same to complete reliably (e.g. timeline CRUD) + tenant_op_locks: IdLockMap, + + // Locking for node-mutating operations: take exclusively for operations that modify the node's persistent state, or + // that transition it to/from Active. + node_op_locks: IdLockMap, + + // Limit how many Reconcilers we will spawn concurrently + reconciler_concurrency: Arc, + + /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile + /// Send into this queue to promptly attempt to reconcile this shard next time units are available. + /// + /// Note that this state logically lives inside ServiceInner, but carrying Sender here makes the code simpler + /// by avoiding needing a &mut ref to something inside the ServiceInner. 
This could be optimized to + /// use a VecDeque instead of a channel to reduce synchronization overhead, at the cost of some code complexity. + delayed_reconcile_tx: tokio::sync::mpsc::Sender, + + // Process shutdown will fire this token + cancel: CancellationToken, + + // Background tasks will hold this gate + gate: Gate, + + /// This waits for initial reconciliation with pageservers to complete. Until this barrier + /// passes, it isn't safe to do any actions that mutate tenants. + pub(crate) startup_complete: Barrier, +} + +impl From for ApiError { + fn from(value: ReconcileWaitError) -> Self { + match value { + ReconcileWaitError::Shutdown => ApiError::ShuttingDown, + e @ ReconcileWaitError::Timeout(_) => ApiError::Timeout(format!("{e}").into()), + e @ ReconcileWaitError::Failed(..) => ApiError::InternalServerError(anyhow::anyhow!(e)), + } + } +} + +impl From for ApiError { + fn from(value: OperationError) -> Self { + match value { + OperationError::NodeStateChanged(err) | OperationError::FinalizeError(err) => { + ApiError::InternalServerError(anyhow::anyhow!(err)) + } + OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()), + } + } +} + +#[allow(clippy::large_enum_variant)] +enum TenantCreateOrUpdate { + Create(TenantCreateRequest), + Update(Vec), +} + +struct ShardSplitParams { + old_shard_count: ShardCount, + new_shard_count: ShardCount, + new_stripe_size: Option, + targets: Vec, + policy: PlacementPolicy, + config: TenantConfig, + shard_ident: ShardIdentity, +} + +// When preparing for a shard split, we may either choose to proceed with the split, +// or find that the work is already done and return NoOp. 
+enum ShardSplitAction { + // Proceed with the split, using these parameters + Split(ShardSplitParams), + // Nothing to do: the work is already done + NoOp(TenantShardSplitResponse), +} + +// A parent shard which will be split +struct ShardSplitTarget { + parent_id: TenantShardId, + node: Node, + child_ids: Vec, +} + +/// When a tenant shard split operation fails, we may not be able to clean up immediately, because nodes +/// might not be available. We therefore use a queue of abort operations processed in the background. +struct TenantShardSplitAbort { + tenant_id: TenantId, + /// The target values from the request that failed + new_shard_count: ShardCount, + new_stripe_size: Option, + /// Until this abort op is complete, no other operations may be done on the tenant + _tenant_lock: WrappedWriteGuard, +} + +/// Error type for aborting a shard split: wraps database and pageserver API errors +/// transparently, plus an `Unavailable` case. +#[derive(thiserror::Error, Debug)] +enum TenantShardSplitAbortError { + #[error(transparent)] + Database(#[from] DatabaseError), + #[error(transparent)] + Remote(#[from] mgmt_api::Error), + #[error("Unavailable")] + Unavailable, +} + +/// Placement policy, tenant config and optional generation for a single tenant shard. +struct ShardUpdate { + tenant_shard_id: TenantShardId, + placement_policy: PlacementPolicy, + tenant_config: TenantConfig, + + /// If this is None, generation is not updated. + generation: Option, +} + +impl Service { + /// Returns a reference to this service's [`Config`]. + pub fn get_config(&self) -> &Config { + &self.config + } + + /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date + /// view of the world, and determine which pageservers are responsive. 
+ #[instrument(skip_all)] + async fn startup_reconcile( + self: &Arc, + bg_compute_notify_result_tx: tokio::sync::mpsc::Sender< + Result<(), (TenantShardId, NotifyError)>, + >, + ) { + // For all tenant shards, a vector of observed states on nodes (where None means + // indeterminate, same as in [`ObservedStateLocation`]) + let mut observed: HashMap)>> = + HashMap::new(); + + // Startup reconciliation does I/O to other services: whether they + // are responsive or not, we should aim to finish within our deadline, because: + // - If we don't, a k8s readiness hook watching /ready will kill us. + // - While we're waiting for startup reconciliation, we are not fully + // available for end user operations like creating/deleting tenants and timelines. + // + // We set multiple deadlines to break up the time available between the phases of work: this is + // arbitrary, but avoids a situation where the first phase could burn our entire timeout period. + let start_at = Instant::now(); + let node_scan_deadline = start_at + .checked_add(STARTUP_RECONCILE_TIMEOUT / 2) + .expect("Reconcile timeout is a modest constant"); + + // Accumulate a list of any tenant locations that ought to be detached + let mut cleanup = Vec::new(); + + let node_listings = self.scan_node_locations(node_scan_deadline).await; + // Send initial heartbeat requests to nodes that replied to the location listing above. 
+ let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await; + + for (node_id, list_response) in node_listings { + let tenant_shards = list_response.tenant_shards; + tracing::info!( + "Received {} shard statuses from pageserver {}, setting it to Active", + tenant_shards.len(), + node_id + ); + + for (tenant_shard_id, conf_opt) in tenant_shards { + let shard_observations = observed.entry(tenant_shard_id).or_default(); + shard_observations.push((node_id, conf_opt)); + } + } + + // List of tenants for which we will attempt to notify compute of their location at startup + let mut compute_notifications = Vec::new(); + + // Populate intent and observed states for all tenants, based on reported state on pageservers + tracing::info!("Populating tenant shards' states from initial pageserver scan..."); + let shard_count = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + // Mark nodes online if they responded to us: nodes are offline by default after a restart. 
+ let mut new_nodes = (**nodes).clone(); + for (node_id, node) in new_nodes.iter_mut() { + if let Some(utilization) = nodes_online.get(node_id) { + node.set_availability(NodeAvailability::Active(UtilizationScore( + utilization.utilization_score, + ))); + scheduler.node_upsert(node); + } + } + *nodes = Arc::new(new_nodes); + + for (tenant_shard_id, shard_observations) in observed { + for (node_id, observed_loc) in shard_observations { + let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { + cleanup.push((tenant_shard_id, node_id)); + continue; + }; + tenant_shard + .observed + .locations + .insert(node_id, ObservedStateLocation { conf: observed_loc }); + } + } + + // Populate each tenant's intent state + let mut schedule_context = ScheduleContext::default(); + for (tenant_shard_id, tenant_shard) in tenants.iter_mut() { + if tenant_shard_id.shard_number == ShardNumber(0) { + // Reset scheduling context each time we advance to the next Tenant + schedule_context = ScheduleContext::default(); + } + + tenant_shard.intent_from_observed(scheduler); + if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) { + // Non-fatal error: we are unable to properly schedule the tenant, perhaps because + // not enough pageservers are available. The tenant may well still be available + // to clients. + tracing::error!("Failed to schedule tenant {tenant_shard_id} at startup: {e}"); + } else { + // If we're both intending and observed to be attached at a particular node, we will + // emit a compute notification for this. In the case where our observed state does not + // yet match our intent, we will eventually reconcile, and that will emit a compute notification. 
+ if let Some(attached_at) = tenant_shard.stably_attached() { + compute_notifications.push(( + *tenant_shard_id, + attached_at, + tenant_shard.shard.stripe_size, + )); + } + } + } + + tenants.len() + }; + + // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that + // generation_pageserver in the database. + + // Emit compute hook notifications for all tenants which are already stably attached. Other tenants + // will emit compute hook notifications when they reconcile. + // + // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later + // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later + // calls will be correctly ordered wrt these. + // + // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them + // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore + // unit and start doing I/O. + tracing::info!( + "Sending {} compute notifications", + compute_notifications.len() + ); + self.compute_hook.notify_background( + compute_notifications, + bg_compute_notify_result_tx.clone(), + &self.cancel, + ); + + // Finally, now that the service is up and running, launch reconcile operations for any tenants + // which require it: under normal circumstances this should only include tenants that were in some + // transient state before we restarted, or any tenants whose compute hooks failed above. + tracing::info!("Checking for shards in need of reconciliation..."); + let reconcile_tasks = self.reconcile_all(); + // We will not wait for these reconciliation tasks to run here: we're now done with startup and + // normal operations may proceed. + + // Clean up any tenants that were found on pageservers but are not known to us. 
Do this in the + // background because it does not need to complete in order to proceed with other work. + if !cleanup.is_empty() { + tracing::info!("Cleaning up {} locations in the background", cleanup.len()); + tokio::task::spawn({ + let cleanup_self = self.clone(); + async move { cleanup_self.cleanup_locations(cleanup).await } + }); + } + + tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)"); + } + + async fn initial_heartbeat_round<'a>( + &self, + node_ids: impl Iterator, + ) -> HashMap { + assert!(!self.startup_complete.is_ready()); + + let all_nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let mut nodes_to_heartbeat = HashMap::new(); + for node_id in node_ids { + match all_nodes.get(node_id) { + Some(node) => { + nodes_to_heartbeat.insert(*node_id, node.clone()); + } + None => { + tracing::warn!("Node {node_id} was removed during start-up"); + } + } + } + + tracing::info!("Sending initial heartbeats..."); + let res = self + .heartbeater + .heartbeat(Arc::new(nodes_to_heartbeat)) + .await; + + let mut online_nodes = HashMap::new(); + if let Ok(deltas) = res { + for (node_id, status) in deltas.0 { + match status { + PageserverState::Available { utilization, .. } => { + online_nodes.insert(node_id, utilization); + } + PageserverState::Offline => {} + } + } + } + + online_nodes + } + + /// Used during [`Self::startup_reconcile`]: issue GETs to all nodes concurrently, with a deadline. 
+ /// + /// The result includes only nodes which responded within the deadline + /// (a node whose listing request returned an error is logged and omitted as well). + /// Collection also stops early, returning the partial results, if a listing reports shutdown. + async fn scan_node_locations( + &self, + deadline: Instant, + ) -> HashMap { + // Snapshot the node map; the read guard is dropped at the end of this block. + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let mut node_results = HashMap::new(); + + // One listing future per node, polled concurrently and yielded in completion order. + let mut node_list_futs = FuturesUnordered::new(); + + tracing::info!("Scanning shards on {} nodes...", nodes.len()); + for node in nodes.values() { + node_list_futs.push({ + async move { + tracing::info!("Scanning shards on node {node}..."); + let timeout = Duration::from_secs(1); + // NOTE(review): the positional `1, 5` are tuning arguments of `with_client_retries`; + // their exact meaning is not visible here — confirm against its signature. + let response = node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + timeout, + &self.cancel, + ) + .await; + (node.get_id(), response) + } + }); + } + + loop { + let (node_id, result) = tokio::select! { + next = node_list_futs.next() => { + match next { + Some(result) => result, + None =>{ + // We got results for all our nodes + break; + } + + } + }, + // The sleep is re-created on every iteration with the time remaining until `deadline`; + // `duration_since` yields zero once the deadline has passed (saturating since Rust 1.60), + // so this arm then completes immediately. + _ = tokio::time::sleep(deadline.duration_since(Instant::now())) => { + // Give up waiting for anyone who hasn't responded: we will yield the results that we have + tracing::info!("Reached deadline while waiting for nodes to respond to location listing requests"); + break; + } + }; + + // `None` from `with_client_retries` is treated as shutdown: stop collecting and return what we have. + let Some(list_response) = result else { + tracing::info!("Shutdown during startup_reconcile"); + break; + }; + + match list_response { + Err(e) => { + tracing::warn!("Could not scan node {} ({e})", node_id); + } + Ok(listing) => { + node_results.insert(node_id, listing); + } + } + } + + node_results + } + + /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers. + /// + /// This is safe to run in the background, because if we don't have this TenantShardId in our map of + /// tenants, then it is probably something incompletely deleted before: we will not fight with any + /// other task trying to attach it. 
+ #[instrument(skip_all)] + async fn cleanup_locations(&self, cleanup: Vec<(TenantShardId, NodeId)>) { + let nodes = self.inner.read().unwrap().nodes.clone(); + + for (tenant_shard_id, node_id) in cleanup { + // A node reported a tenant_shard_id which is unknown to us: detach it. + let Some(node) = nodes.get(&node_id) else { + // This is legitimate; we run in the background and [`Self::startup_reconcile`] might have identified + // a location to clean up on a node that has since been removed. + tracing::info!( + "Not cleaning up location {node_id}/{tenant_shard_id}: node not found" + ); + continue; + }; + + // Stop issuing detach requests once the shutdown token has fired; remaining entries are skipped. + if self.cancel.is_cancelled() { + break; + } + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + // Request Detached mode with no generation or secondary config. The stripe size and tenant + // config are zero/default values (presumably ignored for a detach — TODO confirm). + match client + .location_config( + tenant_shard_id, + LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: tenant_shard_id.shard_number.0, + shard_count: tenant_shard_id.shard_count.literal(), + shard_stripe_size: 0, + tenant_conf: models::TenantConfig::default(), + }, + None, + false, + ) + .await + { + Ok(()) => { + tracing::info!( + "Detached unknown shard {tenant_shard_id} on pageserver {node_id}" + ); + } + Err(e) => { + // Non-fatal error: leaving a tenant shard behind that we are not managing shouldn't + // break anything. + // NOTE(review): "unknkown" in the message below is a typo for "unknown". It is left + // unchanged here because the text is emitted at runtime and may be matched elsewhere. + tracing::error!( + "Failed to detach unknkown shard {tenant_shard_id} on pageserver {node_id}: {e}" + ); + } + } + } + } + + /// Long running background task that periodically wakes up and looks for shards that need + /// reconciliation. Reconciliation is fallible, so any reconciliation tasks that fail during + /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible + /// for those retries. 
+ #[instrument(skip_all)] + async fn background_reconcile(self: &Arc) { + self.startup_complete.clone().wait().await; + + const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); + + let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); + while !self.cancel.is_cancelled() { + tokio::select! { + _ = interval.tick() => { + let reconciles_spawned = self.reconcile_all(); + if reconciles_spawned == 0 { + // Run optimizer only when we didn't find any other work to do + let optimizations = self.optimize_all().await; + if optimizations == 0 { + // Run new splits only when no optimizations are pending + self.autosplit_tenants().await; + } + } + } + _ = self.cancel.cancelled() => return + } + } + } + #[instrument(skip_all)] + async fn spawn_heartbeat_driver(&self) { + self.startup_complete.clone().wait().await; + + const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); + + let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); + while !self.cancel.is_cancelled() { + tokio::select! { + _ = interval.tick() => { } + _ = self.cancel.cancelled() => return + }; + + let nodes = { + let locked = self.inner.read().unwrap(); + locked.nodes.clone() + }; + + let res = self.heartbeater.heartbeat(nodes).await; + if let Ok(deltas) = res { + for (node_id, state) in deltas.0 { + let (new_node, new_availability) = match state { + PageserverState::Available { + utilization, new, .. + } => ( + new, + NodeAvailability::Active(UtilizationScore( + utilization.utilization_score, + )), + ), + PageserverState::Offline => (false, NodeAvailability::Offline), + }; + + if new_node { + // When the heartbeats detect a newly added node, we don't wish + // to attempt to reconcile the shards assigned to it. The node + // is likely handling it's re-attach response, so reconciling now + // would be counterproductive. + // + // Instead, update the in-memory state with the details learned about the + // node. 
+ let mut locked = self.inner.write().unwrap(); + let (nodes, _tenants, scheduler) = locked.parts_mut(); + + let mut new_nodes = (**nodes).clone(); + + if let Some(node) = new_nodes.get_mut(&node_id) { + node.set_availability(new_availability); + scheduler.node_upsert(node); + } + + locked.nodes = Arc::new(new_nodes); + } else { + // This is the code path for geniune availability transitions (i.e node + // goes unavailable and/or comes back online). + let res = self + .node_configure(node_id, Some(new_availability), None) + .await; + + match res { + Ok(()) => {} + Err(ApiError::NotFound(_)) => { + // This should be rare, but legitimate since the heartbeats are done + // on a snapshot of the nodes. + tracing::info!( + "Node {} was not found after heartbeat round", + node_id + ); + } + Err(err) => { + tracing::error!( + "Failed to update node {} after heartbeat round: {}", + node_id, + err + ); + } + } + } + } + } + } + } + + /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation + /// was successful and intent hasn't changed since the Reconciler was spawned, this will update + /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] + /// will indicate that reconciliation is not needed. + #[instrument(skip_all, fields( + tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), + sequence=%result.sequence + ))] + fn process_result(&self, result: ReconcileResult) { + let mut locked = self.inner.write().unwrap(); + let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else { + // A reconciliation result might race with removing a tenant: drop results for + // tenants that aren't in our map. + return; + }; + + // Usually generation should only be updated via this path, so the max() isn't + // needed, but it is used to handle out-of-band updates via. e.g. test hook. 
+ tenant.generation = std::cmp::max(tenant.generation, result.generation); + + // If the reconciler signals that it failed to notify compute, set this state on + // the shard so that a future [`TenantShard::maybe_reconcile`] will try again. + tenant.pending_compute_notification = result.pending_compute_notification; + + // Let the TenantShard know it is idle. + tenant.reconcile_complete(result.sequence); + + match result.result { + Ok(()) => { + for (node_id, loc) in &result.observed.locations { + if let Some(conf) = &loc.conf { + tracing::info!("Updating observed location {}: {:?}", node_id, conf); + } else { + tracing::info!("Setting observed location {} to None", node_id,) + } + } + tenant.observed = result.observed; + tenant.waiter.advance(result.sequence); + } + Err(e) => { + match e { + ReconcileError::Cancel => { + tracing::info!("Reconciler was cancelled"); + } + ReconcileError::Remote(mgmt_api::Error::Cancelled) => { + // This might be due to the reconciler getting cancelled, or it might + // be due to the `Node` being marked offline. + tracing::info!("Reconciler cancelled during pageserver API call"); + } + _ => { + tracing::warn!("Reconcile error: {}", e); + } + } + + // Ordering: populate last_error before advancing error_seq, + // so that waiters will see the correct error after waiting. + tenant.set_last_error(result.sequence, e); + + for (node_id, o) in result.observed.locations { + tenant.observed.locations.insert(node_id, o); + } + } + } + + // Maybe some other work can proceed now that this job finished. 
+ if self.reconciler_concurrency.available_permits() > 0 { + while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() { + let (nodes, tenants, _scheduler) = locked.parts_mut(); + if let Some(shard) = tenants.get_mut(&tenant_shard_id) { + shard.delayed_reconcile = false; + self.maybe_reconcile_shard(shard, nodes); + } + + if self.reconciler_concurrency.available_permits() == 0 { + break; + } + } + } + } + + async fn process_results( + &self, + mut result_rx: tokio::sync::mpsc::UnboundedReceiver, + mut bg_compute_hook_result_rx: tokio::sync::mpsc::Receiver< + Result<(), (TenantShardId, NotifyError)>, + >, + ) { + loop { + // Wait for the next result, or for cancellation + tokio::select! { + r = result_rx.recv() => { + match r { + Some(result) => {self.process_result(result);}, + None => {break;} + } + } + _ = async{ + match bg_compute_hook_result_rx.recv().await { + Some(result) => { + if let Err((tenant_shard_id, notify_error)) = result { + tracing::warn!("Marking shard {tenant_shard_id} for notification retry, due to error {notify_error}"); + let mut locked = self.inner.write().unwrap(); + if let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) { + shard.pending_compute_notification = true; + } + + } + }, + None => { + // This channel is dead, but we don't want to terminate the outer loop{}: just wait for shutdown + self.cancel.cancelled().await; + } + } + } => {}, + _ = self.cancel.cancelled() => { + break; + } + }; + } + + // We should only fall through on shutdown + assert!(self.cancel.is_cancelled()); + } + + async fn process_aborts( + &self, + mut abort_rx: tokio::sync::mpsc::UnboundedReceiver, + ) { + loop { + // Wait for the next result, or for cancellation + let op = tokio::select! 
{ + r = abort_rx.recv() => { + match r { + Some(op) => {op}, + None => {break;} + } + } + _ = self.cancel.cancelled() => { + break; + } + }; + + // Retry until shutdown: we must keep this request object alive until it is properly + // processed, as it holds a lock guard that prevents other operations trying to do things + // to the tenant while it is in a weird part-split state. + while !self.cancel.is_cancelled() { + match self.abort_tenant_shard_split(&op).await { + Ok(_) => break, + Err(e) => { + tracing::warn!( + "Failed to abort shard split on {}, will retry: {e}", + op.tenant_id + ); + + // If a node is unavailable, we hope that it has been properly marked Offline + // when we retry, so that the abort op will succeed. If the abort op is failing + // for some other reason, we will keep retrying forever, or until a human notices + // and does something about it (either fixing a pageserver or restarting the controller). + tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) + .await + .ok(); + } + } + } + } + } + + pub async fn spawn(config: Config, persistence: Arc) -> anyhow::Result> { + let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); + let (abort_tx, abort_rx) = tokio::sync::mpsc::unbounded_channel(); + + tracing::info!("Loading nodes from database..."); + let nodes = persistence + .list_nodes() + .await? 
+ .into_iter() + .map(Node::from_persistent) + .collect::>(); + let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); + tracing::info!("Loaded {} nodes from database.", nodes.len()); + + tracing::info!("Loading shards from database..."); + let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; + tracing::info!( + "Loaded {} shards from database.", + tenant_shard_persistence.len() + ); + + // If any shard splits were in progress, reset the database state to abort them + let mut tenant_shard_count_min_max: HashMap = + HashMap::new(); + for tsp in &mut tenant_shard_persistence { + let shard = tsp.get_shard_identity()?; + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let entry = tenant_shard_count_min_max + .entry(tenant_shard_id.tenant_id) + .or_insert_with(|| (shard.count, shard.count)); + entry.0 = std::cmp::min(entry.0, shard.count); + entry.1 = std::cmp::max(entry.1, shard.count); + } + + for (tenant_id, (count_min, count_max)) in tenant_shard_count_min_max { + if count_min != count_max { + // Aborting the split in the database and dropping the child shards is sufficient: the reconciliation in + // [`Self::startup_reconcile`] will implicitly drop the child shards on remote pageservers, or they'll + // be dropped later in [`Self::node_activate_reconcile`] if it isn't available right now. + tracing::info!("Aborting shard split {tenant_id} {count_min:?} -> {count_max:?}"); + let abort_status = persistence.abort_shard_split(tenant_id, count_max).await?; + + // We may never see the Complete status here: if the split was complete, we wouldn't have + // identified this tenant has having mismatching min/max counts. + assert!(matches!(abort_status, AbortShardSplitStatus::Aborted)); + + // Clear the splitting status in-memory, to reflect that we just aborted in the database + tenant_shard_persistence.iter_mut().for_each(|tsp| { + // Set idle split state on those shards that we will retain. 
+ let tsp_tenant_id = TenantId::from_str(tsp.tenant_id.as_str()).unwrap(); + if tsp_tenant_id == tenant_id + && tsp.get_shard_identity().unwrap().count == count_min + { + tsp.splitting = SplitState::Idle; + } else if tsp_tenant_id == tenant_id { + // Leave the splitting state on the child shards: this will be used next to + // drop them. + tracing::info!( + "Shard {tsp_tenant_id} will be dropped after shard split abort", + ); + } + }); + + // Drop shards for this tenant which we didn't just mark idle (i.e. child shards of the aborted split) + tenant_shard_persistence.retain(|tsp| { + TenantId::from_str(tsp.tenant_id.as_str()).unwrap() != tenant_id + || tsp.splitting == SplitState::Idle + }); + } + } + + let mut tenants = BTreeMap::new(); + + let mut scheduler = Scheduler::new(nodes.values()); + + #[cfg(feature = "testing")] + { + // Hack: insert scheduler state for all nodes referenced by shards, as compatibility + // tests only store the shards, not the nodes. The nodes will be loaded shortly + // after when pageservers start up and register. + let mut node_ids = HashSet::new(); + for tsp in &tenant_shard_persistence { + if let Some(node_id) = tsp.generation_pageserver { + node_ids.insert(node_id); + } + } + for node_id in node_ids { + tracing::info!("Creating node {} in scheduler for tests", node_id); + let node = Node::new( + NodeId(node_id as u64), + "".to_string(), + 123, + "".to_string(), + 123, + ); + + scheduler.node_upsert(&node); + } + } + for tsp in tenant_shard_persistence { + let tenant_shard_id = tsp.get_tenant_shard_id()?; + + // We will populate intent properly later in [`Self::startup_reconcile`], initially populate + // it with what we can infer: the node for which a generation was most recently issued. 
+ let mut intent = IntentState::new(); + if let Some(generation_pageserver) = tsp.generation_pageserver { + intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); + } + let new_tenant = TenantShard::from_persistent(tsp, intent)?; + + tenants.insert(tenant_shard_id, new_tenant); + } + + let (startup_completion, startup_complete) = utils::completion::channel(); + + // This channel is continuously consumed by process_results, so doesn't need to be very large. + let (bg_compute_notify_result_tx, bg_compute_notify_result_rx) = + tokio::sync::mpsc::channel(512); + + let (delayed_reconcile_tx, delayed_reconcile_rx) = + tokio::sync::mpsc::channel(MAX_DELAYED_RECONCILES); + + let cancel = CancellationToken::new(); + let heartbeater = Heartbeater::new( + config.jwt_token.clone(), + config.max_unavailable_interval, + cancel.clone(), + ); + let this = Arc::new(Self { + inner: Arc::new(std::sync::RwLock::new(ServiceState::new( + nodes, + tenants, + scheduler, + delayed_reconcile_rx, + ))), + config: config.clone(), + persistence, + compute_hook: Arc::new(ComputeHook::new(config.clone())), + result_tx, + heartbeater, + reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new( + config.reconciler_concurrency, + )), + delayed_reconcile_tx, + abort_tx, + startup_complete: startup_complete.clone(), + cancel, + gate: Gate::default(), + tenant_op_locks: Default::default(), + node_op_locks: Default::default(), + }); + + let result_task_this = this.clone(); + tokio::task::spawn(async move { + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = result_task_this.gate.enter() { + result_task_this + .process_results(result_rx, bg_compute_notify_result_rx) + .await + } + }); + + tokio::task::spawn({ + let this = this.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + if let Ok(_gate) = this.gate.enter() { + this.process_aborts(abort_rx).await + } + } + }); + + tokio::task::spawn({ + let 
this = this.clone(); + async move { + if let Ok(_gate) = this.gate.enter() { + loop { + tokio::select! { + _ = this.cancel.cancelled() => { + break; + }, + _ = tokio::time::sleep(Duration::from_secs(60)) => {} + }; + this.tenant_op_locks.housekeeping(); + } + } + } + }); + + tokio::task::spawn({ + let this = this.clone(); + // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`] + // is done. + let startup_completion = startup_completion.clone(); + async move { + // Block shutdown until we're done (we must respect self.cancel) + let Ok(_gate) = this.gate.enter() else { + return; + }; + + this.startup_reconcile(bg_compute_notify_result_tx).await; + drop(startup_completion); + } + }); + + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; + this.background_reconcile().await; + } + }); + + tokio::task::spawn({ + let this = this.clone(); + let startup_complete = startup_complete.clone(); + async move { + startup_complete.wait().await; + this.spawn_heartbeat_driver().await; + } + }); + + Ok(this) + } + + pub(crate) async fn attach_hook( + &self, + attach_req: AttachHookRequest, + ) -> anyhow::Result { + // This is a test hook. To enable using it on tenants that were created directly with + // the pageserver API (not via this service), we will auto-create any missing tenant + // shards with default state. 
+ let insert = { + let locked = self.inner.write().unwrap(); + !locked.tenants.contains_key(&attach_req.tenant_shard_id) + }; + + if insert { + let tsp = TenantShardPersistence { + tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), + shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, + shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32, + shard_stripe_size: 0, + generation: attach_req.generation_override.or(Some(0)), + generation_pageserver: None, + placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), + config: serde_json::to_string(&TenantConfig::default()).unwrap(), + splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), + }; + + match self.persistence.insert_tenant_shards(vec![tsp]).await { + Err(e) => match e { + DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + )) => { + tracing::info!( + "Raced with another request to insert tenant {}", + attach_req.tenant_shard_id + ) + } + _ => return Err(e.into()), + }, + Ok(()) => { + tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); + + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert( + attach_req.tenant_shard_id, + TenantShard::new( + attach_req.tenant_shard_id, + ShardIdentity::unsharded(), + PlacementPolicy::Attached(0), + ), + ); + tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); + } + } + } + + let new_generation = if let Some(req_node_id) = attach_req.node_id { + let maybe_tenant_conf = { + let locked = self.inner.write().unwrap(); + locked + .tenants + .get(&attach_req.tenant_shard_id) + .map(|t| t.config.clone()) + }; + + match maybe_tenant_conf { + Some(conf) => { + let new_generation = self + .persistence + .increment_generation(attach_req.tenant_shard_id, req_node_id) + .await?; + + // Persist the placement policy update. 
This is required + // when we reattaching a detached tenant. + self.persistence + .update_tenant_shard( + TenantFilter::Shard(attach_req.tenant_shard_id), + Some(PlacementPolicy::Attached(0)), + Some(conf), + None, + None, + ) + .await?; + Some(new_generation) + } + None => { + anyhow::bail!("Attach hook handling raced with tenant removal") + } + } + } else { + self.persistence.detach(attach_req.tenant_shard_id).await?; + None + }; + + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + let tenant_shard = tenants + .get_mut(&attach_req.tenant_shard_id) + .expect("Checked for existence above"); + + if let Some(new_generation) = new_generation { + tenant_shard.generation = Some(new_generation); + tenant_shard.policy = PlacementPolicy::Attached(0); + } else { + // This is a detach notification. We must update placement policy to avoid re-attaching + // during background scheduling/reconciliation, or during storage controller restart. + assert!(attach_req.node_id.is_none()); + tenant_shard.policy = PlacementPolicy::Detached; + } + + if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { + tracing::info!( + tenant_id = %attach_req.tenant_shard_id, + ps_id = %attaching_pageserver, + generation = ?tenant_shard.generation, + "issuing", + ); + } else if let Some(ps_id) = tenant_shard.intent.get_attached() { + tracing::info!( + tenant_id = %attach_req.tenant_shard_id, + %ps_id, + generation = ?tenant_shard.generation, + "dropping", + ); + } else { + tracing::info!( + tenant_id = %attach_req.tenant_shard_id, + "no-op: tenant already has no pageserver"); + } + tenant_shard + .intent + .set_attached(scheduler, attach_req.node_id); + + tracing::info!( + "attach_hook: tenant {} set generation {:?}, pageserver {}", + attach_req.tenant_shard_id, + tenant_shard.generation, + // TODO: this is an odd number of 0xf's + attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) + ); + + // Trick the reconciler into not 
doing anything for this tenant: this helps + // tests that manually configure a tenant on the pagesrever, and then call this + // attach hook: they don't want background reconciliation to modify what they + // did to the pageserver. + #[cfg(feature = "testing")] + { + if let Some(node_id) = attach_req.node_id { + tenant_shard.observed.locations = HashMap::from([( + node_id, + ObservedStateLocation { + conf: Some(attached_location_conf( + tenant_shard.generation.unwrap(), + &tenant_shard.shard, + &tenant_shard.config, + false, + )), + }, + )]); + } else { + tenant_shard.observed.locations.clear(); + } + } + + Ok(AttachHookResponse { + gen: attach_req + .node_id + .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), + }) + } + + pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse { + let locked = self.inner.read().unwrap(); + + let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id); + + InspectResponse { + attachment: tenant_shard.and_then(|s| { + s.intent + .get_attached() + .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps)) + }), + } + } + + // When the availability state of a node transitions to active, we must do a full reconciliation + // of LocationConfigs on that node. This is because while a node was offline: + // - we might have proceeded through startup_reconcile without checking for extraneous LocationConfigs on this node + // - aborting a tenant shard split might have left rogue child shards behind on this node. + // + // This function must complete _before_ setting a `Node` to Active: once it is set to Active, other + // Reconcilers might communicate with the node, and these must not overlap with the work we do in + // this function. 
+ // + // The reconciliation logic in here is very similar to what [`Self::startup_reconcile`] does, but + // for written for a single node rather than as a batch job for all nodes. + #[tracing::instrument(skip_all, fields(node_id=%node.get_id()))] + async fn node_activate_reconcile( + &self, + mut node: Node, + _lock: &WrappedWriteGuard, + ) -> Result<(), ApiError> { + // This Node is a mutable local copy: we will set it active so that we can use its + // API client to reconcile with the node. The Node in [`Self::nodes`] will get updated + // later. + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + + let configs = match node + .with_client_retries( + |client| async move { client.list_location_config().await }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // This node didn't succeed listing its locations: it may not proceed to active state + // as it is apparently unavailable. 
+ return Err(ApiError::PreconditionFailed( + format!("Failed to query node location configs, cannot activate ({e})").into(), + )); + } + Some(Ok(configs)) => configs, + }; + tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len()); + + let mut cleanup = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + + for (tenant_shard_id, observed_loc) in configs.tenant_shards { + let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else { + cleanup.push(tenant_shard_id); + continue; + }; + tenant_shard + .observed + .locations + .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); + } + } + + for tenant_shard_id in cleanup { + tracing::info!("Detaching {tenant_shard_id}"); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: tenant_shard_id.shard_number.0, + shard_count: tenant_shard_id.shard_count.literal(), + shard_stripe_size: 0, + tenant_conf: models::TenantConfig::default(), + }; + client + .location_config(tenant_shard_id, config, None, false) + .await + }, + &self.config.jwt_token, + 1, + 5, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + None => { + // We're shutting down (the Node's cancellation token can't have fired, because + // we're the only scope that has a reference to it, and we didn't fire it). + return Err(ApiError::ShuttingDown); + } + Some(Err(e)) => { + // Do not let the node proceed to Active state if it is not responsive to requests + // to detach. This could happen if e.g. a shutdown bug in the pageserver is preventing + // detach completing: we should not let this node back into the set of nodes considered + // okay for scheduling. 
+ return Err(ApiError::Conflict(format!( + "Node {node} failed to detach {tenant_shard_id}: {e}" + ))); + } + Some(Ok(_)) => {} + }; + } + + Ok(()) + } + + pub(crate) async fn re_attach( + &self, + reattach_req: ReAttachRequest, + ) -> Result { + if let Some(register_req) = reattach_req.register { + self.node_register(register_req).await?; + } + + // Ordering: we must persist generation number updates before making them visible in the in-memory state + let incremented_generations = self.persistence.re_attach(reattach_req.node_id).await?; + + tracing::info!( + node_id=%reattach_req.node_id, + "Incremented {} tenant shards' generations", + incremented_generations.len() + ); + + // Apply the updated generation to our in-memory state, and + // gather discover secondary locations. + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let mut response = ReAttachResponse { + tenants: Vec::new(), + }; + + // TODO: cancel/restart any running reconciliation for this tenant, it might be trying + // to call location_conf API with an old generation. Wait for cancellation to complete + // before responding to this request. Requires well implemented CancellationToken logic + // all the way to where we call location_conf. Even then, there can still be a location_conf + // request in flight over the network: TODO handle that by making location_conf API refuse + // to go backward in generations. + + // Scan through all shards, applying updates for ones where we updated generation + // and identifying shards that intend to have a secondary location on this node. + for (tenant_shard_id, shard) in tenants { + if let Some(new_gen) = incremented_generations.get(tenant_shard_id) { + let new_gen = *new_gen; + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: Some(new_gen.into().unwrap()), + // A tenant is only put into multi or stale modes in the middle of a [`Reconciler::live_migrate`] + // execution. 
If a pageserver is restarted during that process, then the reconcile pass will + // fail, and start from scratch, so it doesn't make sense for us to try and preserve + // the stale/multi states at this point. + mode: LocationConfigMode::AttachedSingle, + }); + + shard.generation = std::cmp::max(shard.generation, Some(new_gen)); + if let Some(observed) = shard.observed.locations.get_mut(&reattach_req.node_id) { + // Why can we update `observed` even though we're not sure our response will be received + // by the pageserver? Because the pageserver will not proceed with startup until + // it has processed response: if it loses it, we'll see another request and increment + // generation again, avoiding any uncertainty about dirtiness of tenant's state. + if let Some(conf) = observed.conf.as_mut() { + conf.generation = new_gen.into(); + } + } else { + // This node has no observed state for the shard: perhaps it was offline + // when the pageserver restarted. Insert a None, so that the Reconciler + // will be prompted to learn the location's state before it makes changes. + shard + .observed + .locations + .insert(reattach_req.node_id, ObservedStateLocation { conf: None }); + } + } else if shard.intent.get_secondary().contains(&reattach_req.node_id) { + // Ordering: pageserver will not accept /location_config requests until it has + // finished processing the response from re-attach. So we can update our in-memory state + // now, and be confident that we are not stamping on the result of some later location config. + // TODO: however, we are not strictly ordered wrt ReconcileResults queue, + // so we might update observed state here, and then get over-written by some racing + // ReconcileResult. The impact is low however, since we have set state on pageserver something + // that matches intent, so worst case if we race then we end up doing a spurious reconcile. 
+ + response.tenants.push(ReAttachResponseTenant { + id: *tenant_shard_id, + gen: None, + mode: LocationConfigMode::Secondary, + }); + + // We must not update observed, because we have no guarantee that our + // response will be received by the pageserver. This could leave it + // falsely dirty, but the resulting reconcile should be idempotent. + } + } + + // We consider a node Active once we have composed a re-attach response, but we + // do not call [`Self::node_activate_reconcile`]: the handling of the re-attach response + // implicitly synchronizes the LocationConfigs on the node. + // + // Setting a node active unblocks any Reconcilers that might write to the location config API, + // but those requests will not be accepted by the node until it has finished processing + // the re-attach response. + // + // Additionally, reset the nodes scheduling policy to match the conditional update done + // in [`Persistence::re_attach`]. + if let Some(node) = nodes.get(&reattach_req.node_id) { + let reset_scheduling = matches!( + node.get_scheduling(), + NodeSchedulingPolicy::PauseForRestart + | NodeSchedulingPolicy::Draining + | NodeSchedulingPolicy::Filling + ); + + if !node.is_available() || reset_scheduling { + let mut new_nodes = (**nodes).clone(); + if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) { + if !node.is_available() { + node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); + } + + if reset_scheduling { + node.set_scheduling(NodeSchedulingPolicy::Active); + } + + scheduler.node_upsert(node); + let new_nodes = Arc::new(new_nodes); + *nodes = new_nodes; + } + } + } + + Ok(response) + } + + pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse { + let locked = self.inner.read().unwrap(); + + let mut response = ValidateResponse { + tenants: Vec::new(), + }; + + for req_tenant in validate_req.tenants { + if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { + let valid = 
tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + tracing::info!( + "handle_validate: {}(gen {}): valid={valid} (latest {:?})", + req_tenant.id, + req_tenant.gen, + tenant_shard.generation + ); + response.tenants.push(ValidateResponseTenant { + id: req_tenant.id, + valid, + }); + } else { + // After tenant deletion, we may approve any validation. This avoids + // spurious warnings on the pageserver if it has pending LSN updates + // at the point a deletion happens. + response.tenants.push(ValidateResponseTenant { + id: req_tenant.id, + valid: true, + }); + } + } + response + } + + pub(crate) async fn tenant_create( + &self, + create_req: TenantCreateRequest, + ) -> Result { + let tenant_id = create_req.new_tenant_id.tenant_id; + + // Exclude any concurrent attempts to create/access the same tenant ID + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + create_req.new_tenant_id.tenant_id, + TenantOperations::Create, + ) + .await; + let (response, waiters) = self.do_tenant_create(create_req).await?; + + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + // Avoid deadlock: reconcile may fail while notifying compute, if the cloud control plane refuses to + // accept compute notifications while it is in the process of creating. Reconciliation will + // be retried in the background. + tracing::warn!(%tenant_id, "Reconcile not done yet while creating tenant ({e})"); + } + Ok(response) + } + + pub(crate) async fn do_tenant_create( + &self, + create_req: TenantCreateRequest, + ) -> Result<(TenantCreateResponse, Vec), ApiError> { + let placement_policy = create_req + .placement_policy + .clone() + // As a default, zero secondaries is convenient for tests that don't choose a policy. + .unwrap_or(PlacementPolicy::Attached(0)); + + // This service expects to handle sharding itself: it is an error to try and directly create + // a particular shard here. 
+ let tenant_id = if !create_req.new_tenant_id.is_unsharded() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Attempted to create a specific shard, this API is for creating the whole tenant" + ))); + } else { + create_req.new_tenant_id.tenant_id + }; + + tracing::info!( + "Creating tenant {}, shard_count={:?}", + create_req.new_tenant_id, + create_req.shard_parameters.count, + ); + + let create_ids = (0..create_req.shard_parameters.count.count()) + .map(|i| TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count: create_req.shard_parameters.count, + }) + .collect::>(); + + // If the caller specifies a None generation, it means "start from default". This is different + // to [`Self::tenant_location_config`], where a None generation is used to represent + // an incompletely-onboarded tenant. + let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) { + tracing::info!( + "tenant_create: secondary mode, generation is_some={}", + create_req.generation.is_some() + ); + create_req.generation.map(Generation::new) + } else { + tracing::info!( + "tenant_create: not secondary mode, generation is_some={}", + create_req.generation.is_some() + ); + Some( + create_req + .generation + .map(Generation::new) + .unwrap_or(INITIAL_GENERATION), + ) + }; + + // Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller + // to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart + // during the creation, rather than risking leaving orphan objects in S3. 
+ let persist_tenant_shards = create_ids + .iter() + .map(|tenant_shard_id| TenantShardPersistence { + tenant_id: tenant_shard_id.tenant_id.to_string(), + shard_number: tenant_shard_id.shard_number.0 as i32, + shard_count: tenant_shard_id.shard_count.literal() as i32, + shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32, + generation: initial_generation.map(|g| g.into().unwrap() as i32), + // The pageserver is not known until scheduling happens: we will set this column when + // incrementing the generation the first time we attach to a pageserver. + generation_pageserver: None, + placement_policy: serde_json::to_string(&placement_policy).unwrap(), + config: serde_json::to_string(&create_req.config).unwrap(), + splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), + }) + .collect(); + + match self + .persistence + .insert_tenant_shards(persist_tenant_shards) + .await + { + Ok(_) => {} + Err(DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + ))) => { + // Unique key violation: this is probably a retry. Because the shard count is part of the unique key, + // if we see a unique key violation it means that the creation request's shard count matches the previous + // creation's shard count. + tracing::info!("Tenant shards already present in database, proceeding with idempotent creation..."); + } + // Any other database error is unexpected and a bug. 
+ Err(e) => return Err(ApiError::InternalServerError(anyhow::anyhow!(e))), + }; + + let mut schedule_context = ScheduleContext::default(); + + let (waiters, response_shards) = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let mut response_shards = Vec::new(); + let mut schcedule_error = None; + + for tenant_shard_id in create_ids { + tracing::info!("Creating shard {tenant_shard_id}..."); + + use std::collections::btree_map::Entry; + match tenants.entry(tenant_shard_id) { + Entry::Occupied(mut entry) => { + tracing::info!( + "Tenant shard {tenant_shard_id} already exists while creating" + ); + + // TODO: schedule() should take an anti-affinity expression that pushes + // attached and secondary locations (independently) away frorm those + // pageservers also holding a shard for this tenant. + + entry + .get_mut() + .schedule(scheduler, &mut schedule_context) + .map_err(|e| { + ApiError::Conflict(format!( + "Failed to schedule shard {tenant_shard_id}: {e}" + )) + })?; + + if let Some(node_id) = entry.get().intent.get_attached() { + let generation = entry + .get() + .generation + .expect("Generation is set when in attached mode"); + response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }); + } + + continue; + } + Entry::Vacant(entry) => { + let state = entry.insert(TenantShard::new( + tenant_shard_id, + ShardIdentity::from_params( + tenant_shard_id.shard_number, + &create_req.shard_parameters, + ), + placement_policy.clone(), + )); + + state.generation = initial_generation; + state.config = create_req.config.clone(); + if let Err(e) = state.schedule(scheduler, &mut schedule_context) { + schcedule_error = Some(e); + } + + // Only include shards in result if we are attaching: the purpose + // of the response is to tell the caller where the shards are attached. 
+ if let Some(node_id) = state.intent.get_attached() { + let generation = state + .generation + .expect("Generation is set when in attached mode"); + response_shards.push(TenantCreateResponseShard { + shard_id: tenant_shard_id, + node_id: *node_id, + generation: generation.into().unwrap(), + }); + } + } + }; + } + + // If we failed to schedule shards, then they are still created in the controller, + // but we return an error to the requester to avoid a silent failure when someone + // tries to e.g. create a tenant whose placement policy requires more nodes than + // are present in the system. We do this here rather than in the above loop, to + // avoid situations where we only create a subset of shards in the tenant. + if let Some(e) = schcedule_error { + return Err(ApiError::Conflict(format!( + "Failed to schedule shard(s): {e}" + ))); + } + + let waiters = tenants + .range_mut(TenantShardId::tenant_range(tenant_id)) + .filter_map(|(_shard_id, shard)| self.maybe_reconcile_shard(shard, nodes)) + .collect::>(); + (waiters, response_shards) + }; + + Ok(( + TenantCreateResponse { + shards: response_shards, + }, + waiters, + )) + } + + /// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded + /// wait for reconciliation to complete before responding. 
+ async fn await_waiters( + &self, + waiters: Vec, + timeout: Duration, + ) -> Result<(), ReconcileWaitError> { + let deadline = Instant::now().checked_add(timeout).unwrap(); + for waiter in waiters { + let timeout = deadline.duration_since(Instant::now()); + waiter.wait_timeout(timeout).await?; + } + + Ok(()) + } + + /// Same as [`Service::await_waiters`], but returns the waiters which are still + /// in progress + async fn await_waiters_remainder( + &self, + waiters: Vec, + timeout: Duration, + ) -> Vec { + let deadline = Instant::now().checked_add(timeout).unwrap(); + for waiter in waiters.iter() { + let timeout = deadline.duration_since(Instant::now()); + let _ = waiter.wait_timeout(timeout).await; + } + + waiters + .into_iter() + .filter(|waiter| matches!(waiter.get_status(), ReconcilerStatus::InProgress)) + .collect::>() + } + + /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, + /// and transform it into either a tenant creation of a series of shard updates. + /// + /// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will + /// still be returned. + fn tenant_location_config_prepare( + &self, + tenant_id: TenantId, + req: TenantLocationConfigRequest, + ) -> TenantCreateOrUpdate { + let mut updates = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + + // Use location config mode as an indicator of policy. 
+ let placement_policy = match req.config.mode { + LocationConfigMode::Detached => PlacementPolicy::Detached, + LocationConfigMode::Secondary => PlacementPolicy::Secondary, + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + if nodes.len() > 1 { + PlacementPolicy::Attached(1) + } else { + // Convenience for dev/test: if we just have one pageserver, import + // tenants into non-HA mode so that scheduling will succeed. + PlacementPolicy::Attached(0) + } + } + }; + + let mut create = true; + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + // Saw an existing shard: this is not a creation + create = false; + + // Shards may have initially been created by a Secondary request, where we + // would have left generation as None. + // + // We only update generation the first time we see an attached-mode request, + // and if there is no existing generation set. The caller is responsible for + // ensuring that no non-storage-controller pageserver ever uses a higher + // generation than they passed in here. + use LocationConfigMode::*; + let set_generation = match req.config.mode { + AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => { + req.config.generation.map(Generation::new) + } + _ => None, + }; + + updates.push(ShardUpdate { + tenant_shard_id: *shard_id, + placement_policy: placement_policy.clone(), + tenant_config: req.config.tenant_conf.clone(), + generation: set_generation, + }); + } + + if create { + use LocationConfigMode::*; + let generation = match req.config.mode { + AttachedMulti | AttachedSingle | AttachedStale => req.config.generation, + // If a caller provided a generation in a non-attached request, ignore it + // and leave our generation as None: this enables a subsequent update to set + // the generation when setting an attached mode for the first time. 
+ _ => None, + }; + + TenantCreateOrUpdate::Create( + // Synthesize a creation request + TenantCreateRequest { + new_tenant_id: tenant_shard_id, + generation, + shard_parameters: ShardParameters { + count: tenant_shard_id.shard_count, + // We only import un-sharded or single-sharded tenants, so stripe + // size can be made up arbitrarily here. + stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, + }, + placement_policy: Some(placement_policy), + config: req.config.tenant_conf, + }, + ) + } else { + assert!(!updates.is_empty()); + TenantCreateOrUpdate::Update(updates) + } + } + + /// This API is used by the cloud control plane to migrate unsharded tenants that it created + /// directly with pageservers into this service. + /// + /// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it + /// has attempted to call this API. Failure to oblige to this rule may lead to S3 corruption. + /// Think of the first attempt to call this API as a transfer of absolute authority over the + /// tenant's source of generation numbers. + /// + /// The mode in this request coarse-grained control of tenants: + /// - Call with mode Attached* to upsert the tenant. 
+ /// - Call with mode Secondary to either onboard a tenant without attaching it, or + /// to set an existing tenant to PolicyMode::Secondary + /// - Call with mode Detached to switch to PolicyMode::Detached + pub(crate) async fn tenant_location_config( + &self, + tenant_shard_id: TenantShardId, + req: TenantLocationConfigRequest, + ) -> Result { + // We require an exclusive lock, because we are updating both persistent and in-memory state + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_shard_id.tenant_id, + TenantOperations::LocationConfig, + ) + .await; + + if !tenant_shard_id.is_unsharded() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "This API is for importing single-sharded or unsharded tenants" + ))); + } + + // First check if this is a creation or an update + let create_or_update = self.tenant_location_config_prepare(tenant_shard_id.tenant_id, req); + + let mut result = TenantLocationConfigResponse { + shards: Vec::new(), + stripe_size: None, + }; + let waiters = match create_or_update { + TenantCreateOrUpdate::Create(create_req) => { + let (create_resp, waiters) = self.do_tenant_create(create_req).await?; + result.shards = create_resp + .shards + .into_iter() + .map(|s| TenantShardLocation { + node_id: s.node_id, + shard_id: s.shard_id, + }) + .collect(); + waiters + } + TenantCreateOrUpdate::Update(updates) => { + // Persist updates + // Ordering: write to the database before applying changes in-memory, so that + // we will not appear time-travel backwards on a restart. 
+ let mut schedule_context = ScheduleContext::default(); + for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation, + } in &updates + { + self.persistence + .update_tenant_shard( + TenantFilter::Shard(*tenant_shard_id), + Some(placement_policy.clone()), + Some(tenant_config.clone()), + *generation, + None, + ) + .await?; + } + + // Apply updates in-memory + let mut waiters = Vec::new(); + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + for ShardUpdate { + tenant_shard_id, + placement_policy, + tenant_config, + generation: update_generation, + } in updates + { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + tracing::warn!("Shard {tenant_shard_id} removed while updating"); + continue; + }; + + // Update stripe size + if result.stripe_size.is_none() && shard.shard.count.count() > 1 { + result.stripe_size = Some(shard.shard.stripe_size); + } + + shard.policy = placement_policy; + shard.config = tenant_config; + if let Some(generation) = update_generation { + shard.generation = Some(generation); + } + + shard.schedule(scheduler, &mut schedule_context)?; + + let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); + if let Some(waiter) = maybe_waiter { + waiters.push(waiter); + } + + if let Some(node_id) = shard.intent.get_attached() { + result.shards.push(TenantShardLocation { + shard_id: tenant_shard_id, + node_id: *node_id, + }) + } + } + } + waiters + } + }; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Do not treat a reconcile error as fatal: we have already applied any requested + // Intent changes, and the reconcile can fail for external reasons like unavailable + // compute notification API. In these cases, it is important that we do not + // cause the cloud control plane to retry forever on this API. 
+ tracing::warn!( + "Failed to reconcile after /location_config: {e}, returning success anyway" + ); + } + + // Logging the full result is useful because it lets us cross-check what the cloud control + // plane's tenant_shards table should contain. + tracing::info!("Complete, returning {result:?}"); + + Ok(result) + } + + pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + req.tenant_id, + TenantOperations::ConfigSet, + ) + .await; + + let tenant_id = req.tenant_id; + let config = req.config; + + self.persistence + .update_tenant_shard( + TenantFilter::Tenant(req.tenant_id), + None, + Some(config.clone()), + None, + None, + ) + .await?; + + let waiters = { + let mut waiters = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.config = config.clone(); + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + waiters.push(waiter); + } + } + waiters + }; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Treat this as success because we have stored the configuration. If e.g. + // a node was unavailable at this time, it should not stop us accepting a + // configuration change. 
+ tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}"); + } + + Ok(()) + } + + pub(crate) fn tenant_config_get( + &self, + tenant_id: TenantId, + ) -> Result, ApiError> { + let config = { + let locked = self.inner.read().unwrap(); + + match locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + { + Some((_tenant_shard_id, shard)) => shard.config.clone(), + None => { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )) + } + } + }; + + // Unlike the pageserver, we do not have a set of global defaults: the config is + // entirely per-tenant. Therefore the distinction between `tenant_specific_overrides` + // and `effective_config` in the response is meaningless, but we retain that syntax + // in order to remain compatible with the pageserver API. + + let response = HashMap::from([ + ( + "tenant_specific_overrides", + serde_json::to_value(&config) + .context("serializing tenant specific overrides") + .map_err(ApiError::InternalServerError)?, + ), + ( + "effective_config", + serde_json::to_value(&config) + .context("serializing effective config") + .map_err(ApiError::InternalServerError)?, + ), + ]); + + Ok(response) + } + + pub(crate) async fn tenant_time_travel_remote_storage( + &self, + time_travel_req: &TenantTimeTravelRequest, + tenant_id: TenantId, + timestamp: Cow<'_, str>, + done_if_after: Cow<'_, str>, + ) -> Result<(), ApiError> { + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimeTravelRemoteStorage, + ) + .await; + + let node = { + let locked = self.inner.read().unwrap(); + // Just a sanity check to prevent misuse: the API expects that the tenant is fully + // detached everywhere, and nothing writes to S3 storage. Here, we verify that, + // but only at the start of the process, so it's really just to prevent operator + // mistakes. 
+ for (shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { + if shard.intent.get_attached().is_some() || !shard.intent.get_secondary().is_empty() + { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "We want tenant to be attached in shard with tenant_shard_id={shard_id}" + ))); + } + let maybe_attached = shard + .observed + .locations + .iter() + .filter_map(|(node_id, observed_location)| { + observed_location + .conf + .as_ref() + .map(|loc| (node_id, observed_location, loc.mode)) + }) + .find(|(_, _, mode)| *mode != LocationConfigMode::Detached); + if let Some((node_id, _observed_location, mode)) = maybe_attached { + return Err(ApiError::InternalServerError(anyhow::anyhow!("We observed attached={mode:?} tenant in node_id={node_id} shard with tenant_shard_id={shard_id}"))); + } + } + let scheduler = &locked.scheduler; + // Right now we only perform the operation on a single node without parallelization + // TODO fan out the operation to multiple nodes for better performance + let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while lock is active"); + node.clone() + }; + + // The shard count is encoded in the remote storage's URL, so we need to handle all historically used shard counts + let mut counts = time_travel_req + .shard_counts + .iter() + .copied() + .collect::>() + .into_iter() + .collect::>(); + counts.sort_unstable(); + + for count in counts { + let shard_ids = (0..count.count()) + .map(|i| TenantShardId { + tenant_id, + shard_number: ShardNumber(i), + shard_count: count, + }) + .collect::>(); + for tenant_shard_id in shard_ids { + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + + tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); + + client + .tenant_time_travel_remote_storage( + tenant_shard_id, + ×tamp, + 
&done_if_after, + ) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error doing time travel recovery for shard {tenant_shard_id} on node {}: {e}", + node + )) + })?; + } + } + Ok(()) + } + + pub(crate) async fn tenant_secondary_download( + &self, + tenant_id: TenantId, + wait: Option, + ) -> Result<(StatusCode, SecondaryProgress), ApiError> { + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::SecondaryDownload, + ) + .await; + + // Acquire lock and yield the collection of shard-node tuples which we will send requests onward to + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + for node_id in shard.intent.get_secondary() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + } + targets + }; + + // Issue concurrent requests to all shards' locations + let mut futs = FuturesUnordered::new(); + for (tenant_shard_id, node) in targets { + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + futs.push(async move { + let result = client + .tenant_secondary_download(tenant_shard_id, wait) + .await; + (result, node, tenant_shard_id) + }) + } + + // Handle any errors returned by pageservers. This includes cases like this request racing with + // a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as + // well as more general cases like 503s, 500s, or timeouts. 
+ let mut aggregate_progress = SecondaryProgress::default(); + let mut aggregate_status: Option = None; + let mut error: Option = None; + while let Some((result, node, tenant_shard_id)) = futs.next().await { + match result { + Err(e) => { + // Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever + // is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache + // than they had hoped for. + tracing::warn!("Secondary download error from pageserver {node}: {e}",); + error = Some(e) + } + Ok((status_code, progress)) => { + tracing::info!(%tenant_shard_id, "Shard status={status_code} progress: {progress:?}"); + aggregate_progress.layers_downloaded += progress.layers_downloaded; + aggregate_progress.layers_total += progress.layers_total; + aggregate_progress.bytes_downloaded += progress.bytes_downloaded; + aggregate_progress.bytes_total += progress.bytes_total; + aggregate_progress.heatmap_mtime = + std::cmp::max(aggregate_progress.heatmap_mtime, progress.heatmap_mtime); + aggregate_status = match aggregate_status { + None => Some(status_code), + Some(StatusCode::OK) => Some(status_code), + Some(cur) => { + // Other status codes (e.g. 202) -- do not overwrite. + Some(cur) + } + }; + } + } + } + + // If any of the shards return 202, indicate our result as 202. 
+ match aggregate_status { + None => { + match error { + Some(e) => { + // No successes, and an error: surface it + Err(ApiError::Conflict(format!("Error from pageserver: {e}"))) + } + None => { + // No shards found + Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )) + } + } + } + Some(aggregate_status) => Ok((aggregate_status, aggregate_progress)), + } + } + + pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result { + let _tenant_lock = + trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; + + // Detach all shards + let (detach_waiters, shard_ids, node) = { + let mut shard_ids = Vec::new(); + let mut detach_waiters = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(tenant_id)) + { + shard_ids.push(*tenant_shard_id); + + // Update the tenant's intent to remove all attachments + shard.policy = PlacementPolicy::Detached; + shard + .schedule(scheduler, &mut ScheduleContext::default()) + .expect("De-scheduling is infallible"); + debug_assert!(shard.intent.get_attached().is_none()); + debug_assert!(shard.intent.get_secondary().is_empty()); + + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + detach_waiters.push(waiter); + } + } + + // Pick an arbitrary node to use for remote deletions (does not have to be where the tenant + // was attached, just has to be able to see the S3 content) + let node_id = scheduler.schedule_shard(&[], &ScheduleContext::default())?; + let node = nodes + .get(&node_id) + .expect("Pageservers may not be deleted while lock is active"); + (detach_waiters, shard_ids, node.clone()) + }; + + // This reconcile wait can fail in a few ways: + // A there is a very long queue for the reconciler semaphore + // B some pageserver is failing to handle a detach promptly + // C some pageserver goes offline 
right at the moment we send it a request. + // + // A and C are transient: the semaphore will eventually become available, and once a node is marked offline + // the next attempt to reconcile will silently skip detaches for an offline node and succeed. If B happens, + // it's a bug, and needs resolving at the pageserver level (we shouldn't just leave attachments behind while + // deleting the underlying data). + self.await_waiters(detach_waiters, RECONCILE_TIMEOUT) + .await?; + + let locations = shard_ids + .into_iter() + .map(|s| (s, node.clone())) + .collect::>(); + let results = self.tenant_for_shards_api( + locations, + |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await }, + 1, + 3, + RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + for result in results { + match result { + Ok(StatusCode::ACCEPTED) => { + // This should never happen: we waited for detaches to finish above + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Unexpectedly still attached on {}", + node + ))); + } + Ok(_) => {} + Err(mgmt_api::Error::Cancelled) => { + return Err(ApiError::ShuttingDown); + } + Err(e) => { + // This is unexpected: remote deletion should be infallible, unless the object store + // at large is unavailable. + tracing::error!("Error deleting via node {}: {e}", node); + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); + } + } + } + + // Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop + // our in-memory state and database state. + + // Ordering: we delete persistent state first: if we then + // crash, we will drop the in-memory state. + + // Drop persistent state. 
+ self.persistence.delete_tenant(tenant_id).await?; + + // Drop in-memory state + { + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + + // Dereference Scheduler from shards before dropping them + for (_tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(tenant_id)) + { + shard.intent.clear(scheduler); + } + + tenants.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id); + tracing::info!( + "Deleted tenant {tenant_id}, now have {} tenants", + locked.tenants.len() + ); + }; + + // Success is represented as 404, to imitate the existing pageserver deletion API + Ok(StatusCode::NOT_FOUND) + } + + /// Naming: this configures the storage controller's policies for a tenant, whereas [`Self::tenant_config_set`] is "set the TenantConfig" + /// for a tenant. The TenantConfig is passed through to pageservers, whereas this function modifies + /// the tenant's policies (configuration) within the storage controller + pub(crate) async fn tenant_update_policy( + &self, + tenant_id: TenantId, + req: TenantPolicyRequest, + ) -> Result<(), ApiError> { + // We require an exclusive lock, because we are updating persistent and in-memory state + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::UpdatePolicy, + ) + .await; + + failpoint_support::sleep_millis_async!("tenant-update-policy-exclusive-lock"); + + let TenantPolicyRequest { + placement, + scheduling, + } = req; + + self.persistence + .update_tenant_shard( + TenantFilter::Tenant(tenant_id), + placement.clone(), + None, + None, + scheduling, + ) + .await?; + + let mut schedule_context = ScheduleContext::default(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + if let Some(placement) = &placement { + shard.policy = placement.clone(); + + 
tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated placement policy to {placement:?}"); + } + + if let Some(scheduling) = &scheduling { + shard.set_scheduling_policy(*scheduling); + + tracing::info!(tenant_id=%shard_id.tenant_id, shard_id=%shard_id.shard_slug(), + "Updated scheduling policy to {scheduling:?}"); + } + + // In case scheduling is being switched back on, try it now. + shard.schedule(scheduler, &mut schedule_context).ok(); + self.maybe_reconcile_shard(shard, nodes); + } + + Ok(()) + } + + pub(crate) async fn tenant_timeline_create( + &self, + tenant_id: TenantId, + mut create_req: TimelineCreateRequest, + ) -> Result { + tracing::info!( + "Creating timeline {}/{}", + tenant_id, + create_req.new_timeline_id, + ); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineCreate, + ) + .await; + + self.ensure_attached_wait(tenant_id).await?; + + let mut targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + }; + let shard_zero = targets.remove(0); + + async fn create_one( + tenant_shard_id: TenantShardId, + node: Node, + jwt: Option, + create_req: TimelineCreateRequest, + ) -> Result { + tracing::info!( + "Creating timeline on shard {}/{}, attached to node {node}", + tenant_shard_id, + create_req.new_timeline_id, + ); + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + 
client + .timeline_create(tenant_shard_id, &create_req) + .await + .map_err(|e| passthrough_api_error(&node, e)) + } + + // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then + // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard + // that will get the first creation request, and propagate the LSN to all the >0 shards. + let timeline_info = create_one( + shard_zero.0, + shard_zero.1, + self.config.jwt_token.clone(), + create_req.clone(), + ) + .await?; + + // Propagate the LSN that shard zero picked, if caller didn't provide one + if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() { + create_req.ancestor_start_lsn = timeline_info.ancestor_lsn; + } + + // Create timeline on remaining shards with number >0 + if !targets.is_empty() { + // If we had multiple shards, issue requests for the remainder now. + let jwt = self.config.jwt_token.clone(); + self.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + let create_req = create_req.clone(); + Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) + }) + .await?; + } + + Ok(timeline_info) + } + + /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. + /// + /// On success, the returned vector contains exactly the same number of elements as the input `locations`. 
+ async fn tenant_for_shards( + &self, + locations: Vec<(TenantShardId, Node)>, + mut req_fn: F, + ) -> Result, ApiError> + where + F: FnMut( + TenantShardId, + Node, + ) + -> std::pin::Pin> + Send>>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(req_fn(tenant_shard_id, node)); + } + + while let Some(r) = futs.next().await { + results.push(r?); + } + + Ok(results) + } + + /// Concurrently invoke a pageserver API call on many shards at once + pub(crate) async fn tenant_for_shards_api( + &self, + locations: Vec<(TenantShardId, Node)>, + op: O, + warn_threshold: u32, + max_retries: u32, + timeout: Duration, + cancel: &CancellationToken, + ) -> Vec> + where + O: Fn(TenantShardId, PageserverClient) -> F + Copy, + F: std::future::Future>, + { + let mut futs = FuturesUnordered::new(); + let mut results = Vec::with_capacity(locations.len()); + + for (tenant_shard_id, node) in locations { + futs.push(async move { + node.with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.jwt_token, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await + }); + } + + while let Some(r) = futs.next().await { + let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); + results.push(r); + } + + results + } + + pub(crate) async fn tenant_timeline_delete( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result { + tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,); + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineDelete, + ) + .await; + + self.ensure_attached_wait(tenant_id).await?; + + let mut targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = shard.intent.get_attached().ok_or_else(|| { + 
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled")) + })?; + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push((*tenant_shard_id, node.clone())); + } + targets + }; + + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + let shard_zero = targets.remove(0); + + async fn delete_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + ) -> Result { + tracing::info!( + "Deleting timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client + .timeline_delete(tenant_shard_id, timeline_id) + .await + .map_err(|e| { + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) + }) + } + + let statuses = self + .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + Box::pin(delete_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + )) + }) + .await?; + + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { + return Ok(StatusCode::ACCEPTED); + } + + // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed + // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done. + let shard_zero_status = delete_one( + shard_zero.0, + timeline_id, + shard_zero.1, + self.config.jwt_token.clone(), + ) + .await?; + + Ok(shard_zero_status) + } + + /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this + /// function looks up and returns node. 
If the tenant isn't found, returns Err(ApiError::NotFound) + pub(crate) fn tenant_shard0_node( + &self, + tenant_id: TenantId, + ) -> Result<(Node, TenantShardId), ApiError> { + let locked = self.inner.read().unwrap(); + let Some((tenant_shard_id, shard)) = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .next() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {tenant_id} not found").into(), + )); + }; + + // TODO: should use the ID last published to compute_hook, rather than the intent: the intent might + // point to somewhere we haven't attached yet. + let Some(node_id) = shard.intent.get_attached() else { + tracing::warn!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Shard not scheduled (policy {:?}), cannot generate pass-through URL", + shard.policy + ); + return Err(ApiError::Conflict( + "Cannot call timeline API on non-attached tenant".to_string(), + )); + }; + + let Some(node) = locked.nodes.get(node_id) else { + // This should never happen + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard refers to nonexistent node" + ))); + }; + + Ok((node.clone(), *tenant_shard_id)) + } + + pub(crate) fn tenant_locate( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + tracing::info!("Locating shards for tenant {tenant_id}"); + + let mut result = Vec::new(); + let mut shard_params: Option = None; + + for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let node_id = + shard + .intent + .get_attached() + .ok_or(ApiError::BadRequest(anyhow::anyhow!( + "Cannot locate a tenant that is not attached" + )))?; + + let node = locked + .nodes + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + result.push(node.shard_location(*tenant_shard_id)); + + match &shard_params { + None => { + shard_params = Some(ShardParameters { + stripe_size: shard.shard.stripe_size, + count: 
shard.shard.count, + }); + } + Some(params) => { + if params.stripe_size != shard.shard.stripe_size { + // This should never happen. We enforce at runtime because it's simpler than + // adding an extra per-tenant data structure to store the things that should be the same + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Inconsistent shard stripe size parameters!" + ))); + } + } + } + } + + if result.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards for this tenant ID found").into(), + )); + } + let shard_params = shard_params.expect("result is non-empty, therefore this is set"); + tracing::info!( + "Located tenant {} with params {:?} on shards {}", + tenant_id, + shard_params, + result + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + + Ok(TenantLocateResponse { + shards: result, + shard_params, + }) + } + + /// Returns None if the input iterator of shards does not include a shard with number=0 + fn tenant_describe_impl<'a>( + &self, + shards: impl Iterator, + ) -> Option { + let mut shard_zero = None; + let mut describe_shards = Vec::new(); + + for shard in shards { + if shard.tenant_shard_id.is_shard_zero() { + shard_zero = Some(shard); + } + + describe_shards.push(TenantDescribeResponseShard { + tenant_shard_id: shard.tenant_shard_id, + node_attached: *shard.intent.get_attached(), + node_secondary: shard.intent.get_secondary().to_vec(), + last_error: shard + .last_error + .lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()) + .clone(), + is_reconciling: shard.reconciler.is_some(), + is_pending_compute_notification: shard.pending_compute_notification, + is_splitting: matches!(shard.splitting, SplitState::Splitting), + scheduling_policy: *shard.get_scheduling_policy(), + }) + } + + let shard_zero = shard_zero?; + + Some(TenantDescribeResponse { + tenant_id: shard_zero.tenant_shard_id.tenant_id, + shards: describe_shards, + stripe_size: shard_zero.shard.stripe_size, 
+ policy: shard_zero.policy.clone(), + config: shard_zero.config.clone(), + }) + } + + pub(crate) fn tenant_describe( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + + self.tenant_describe_impl( + locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(_k, v)| v), + ) + .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) + } + + pub(crate) fn tenant_list(&self) -> Vec { + let locked = self.inner.read().unwrap(); + + let mut result = Vec::new(); + for (_tenant_id, tenant_shards) in + &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) + { + result.push( + self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) + .expect("Groups are always non-empty"), + ); + } + + result + } + + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] + async fn abort_tenant_shard_split( + &self, + op: &TenantShardSplitAbort, + ) -> Result<(), TenantShardSplitAbortError> { + // Cleaning up a split: + // - Parent shards are not destroyed during a split, just detached. + // - Failed pageserver split API calls can leave the remote node with just the parent attached, + // just the children attached, or both. + // + // Therefore our work to do is to: + // 1. Clean up storage controller's internal state to just refer to parents, no children + // 2. Call out to pageservers to ensure that children are detached + // 3. Call out to pageservers to ensure that parents are attached. + // + // Crash safety: + // - If the storage controller stops running during this cleanup *after* clearing the splitting state + // from our database, then [`Self::startup_reconcile`] will regard child attachments as garbage + // and detach them. + // - TODO: If the storage controller stops running during this cleanup *before* clearing the splitting state + // from our database, then we will re-enter this cleanup routine on startup. 
+ + let TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + .. + } = op; + + // First abort persistent state, if any exists. + match self + .persistence + .abort_shard_split(*tenant_id, *new_shard_count) + .await? + { + AbortShardSplitStatus::Aborted => { + // Proceed to roll back any child shards created on pageservers + } + AbortShardSplitStatus::Complete => { + // The split completed (we might hit that path if e.g. our database transaction + // to write the completion landed in the database, but we dropped connection + // before seeing the result). + // + // We must update in-memory state to reflect the successful split. + self.tenant_shard_split_commit_inmem( + *tenant_id, + *new_shard_count, + *new_stripe_size, + ); + return Ok(()); + } + } + + // Clean up in-memory state, and accumulate the list of child locations that need detaching + let detach_locations: Vec<(Node, TenantShardId)> = { + let mut detach_locations = Vec::new(); + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + for (tenant_shard_id, shard) in + tenants.range_mut(TenantShardId::tenant_range(op.tenant_id)) + { + if shard.shard.count == op.new_shard_count { + // Surprising: the phase of [`Self::do_tenant_shard_split`] which inserts child shards in-memory + // is infallible, so if we got an error we shouldn't have got that far. 
+ tracing::warn!( + "During split abort, child shard {tenant_shard_id} found in-memory" + ); + continue; + } + + // Add the children of this shard to this list of things to detach + if let Some(node_id) = shard.intent.get_attached() { + for child_id in tenant_shard_id.split(*new_shard_count) { + detach_locations.push(( + nodes + .get(node_id) + .expect("Intent references nonexistent node") + .clone(), + child_id, + )); + } + } else { + tracing::warn!( + "During split abort, shard {tenant_shard_id} has no attached location" + ); + } + + tracing::info!("Restoring parent shard {tenant_shard_id}"); + shard.splitting = SplitState::Idle; + if let Err(e) = shard.schedule(scheduler, &mut ScheduleContext::default()) { + // If this shard can't be scheduled now (perhaps due to offline nodes or + // capacity issues), that must not prevent us rolling back a split. In this + // case it should be eventually scheduled in the background. + tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}") + } + + self.maybe_reconcile_shard(shard, nodes); + } + + // We don't expect any new_shard_count shards to exist here, but drop them just in case + tenants.retain(|_id, s| s.shard.count != *new_shard_count); + + detach_locations + }; + + for (node, child_id) in detach_locations { + if !node.is_available() { + // An unavailable node cannot be cleaned up now: to avoid blocking forever, we will permit this, and + // rely on the reconciliation that happens when a node transitions to Active to clean up. Since we have + // removed child shards from our in-memory state and database, the reconciliation will implicitly remove + // them from the node. + tracing::warn!("Node {node} unavailable, can't clean up during split abort. It will be cleaned up when it is reactivated."); + continue; + } + + // Detach the remote child. If the pageserver split API call is still in progress, this call will get + // a 503 and retry, up to our limit. 
+ tracing::info!("Detaching {child_id} on {node}..."); + match node + .with_client_retries( + |client| async move { + let config = LocationConfig { + mode: LocationConfigMode::Detached, + generation: None, + secondary_conf: None, + shard_number: child_id.shard_number.0, + shard_count: child_id.shard_count.literal(), + // Stripe size and tenant config don't matter when detaching + shard_stripe_size: 0, + tenant_conf: TenantConfig::default(), + }; + + client.location_config(child_id, config, None, false).await + }, + &self.config.jwt_token, + 1, + 10, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(_)) => {} + Some(Err(e)) => { + // We failed to communicate with the remote node. This is problematic: we may be + // leaving it with a rogue child shard. + tracing::warn!( + "Failed to detach child {child_id} from node {node} during abort" + ); + return Err(e.into()); + } + None => { + // Cancellation: we were shutdown or the node went offline. Shutdown is fine, we'll + // clean up on restart. The node going offline requires a retry. + return Err(TenantShardSplitAbortError::Unavailable); + } + }; + } + + tracing::info!("Successfully aborted split"); + Ok(()) + } + + /// Infallible final stage of [`Self::tenant_shard_split`]: update the contents + /// of the tenant map to reflect the child shards that exist after the split. 
+ fn tenant_shard_split_commit_inmem( + &self, + tenant_id: TenantId, + new_shard_count: ShardCount, + new_stripe_size: Option, + ) -> ( + TenantShardSplitResponse, + Vec<(TenantShardId, NodeId, ShardStripeSize)>, + Vec, + ) { + let mut response = TenantShardSplitResponse { + new_shards: Vec::new(), + }; + let mut child_locations = Vec::new(); + let mut waiters = Vec::new(); + + { + let mut locked = self.inner.write().unwrap(); + + let parent_ids = locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(shard_id, _)| *shard_id) + .collect::>(); + + let (nodes, tenants, scheduler) = locked.parts_mut(); + for parent_id in parent_ids { + let child_ids = parent_id.split(new_shard_count); + + let (pageserver, generation, policy, parent_ident, config) = { + let mut old_state = tenants + .remove(&parent_id) + .expect("It was present, we just split it"); + + // A non-splitting state is impossible, because [`Self::tenant_shard_split`] holds + // a TenantId lock and passes it through to [`TenantShardSplitAbort`] in case of cleanup: + // nothing else can clear this. 
+ assert!(matches!(old_state.splitting, SplitState::Splitting)); + + let old_attached = old_state.intent.get_attached().unwrap(); + old_state.intent.clear(scheduler); + let generation = old_state.generation.expect("Shard must have been attached"); + ( + old_attached, + generation, + old_state.policy, + old_state.shard, + old_state.config, + ) + }; + + let mut schedule_context = ScheduleContext::default(); + for child in child_ids { + let mut child_shard = parent_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + if let Some(stripe_size) = new_stripe_size { + child_shard.stripe_size = stripe_size; + } + + let mut child_observed: HashMap = HashMap::new(); + child_observed.insert( + pageserver, + ObservedStateLocation { + conf: Some(attached_location_conf( + generation, + &child_shard, + &config, + matches!(policy, PlacementPolicy::Attached(n) if n > 0), + )), + }, + ); + + let mut child_state = TenantShard::new(child, child_shard, policy.clone()); + child_state.intent = IntentState::single(scheduler, Some(pageserver)); + child_state.observed = ObservedState { + locations: child_observed, + }; + child_state.generation = Some(generation); + child_state.config = config.clone(); + + // The child's TenantShard::splitting is intentionally left at the default value of Idle, + // as at this point in the split process we have succeeded and this part is infallible: + // we will never need to do any special recovery from this state. + + child_locations.push((child, pageserver, child_shard.stripe_size)); + + if let Err(e) = child_state.schedule(scheduler, &mut schedule_context) { + // This is not fatal, because we've implicitly already got an attached + // location for the child shard. Failure here just means we couldn't + // find a secondary (e.g. because cluster is overloaded). 
+ tracing::warn!("Failed to schedule child shard {child}: {e}"); + } + // In the background, attach secondary locations for the new shards + if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) { + waiters.push(waiter); + } + + tenants.insert(child, child_state); + response.new_shards.push(child); + } + } + (response, child_locations, waiters) + } + } + + async fn tenant_shard_split_start_secondaries( + &self, + tenant_id: TenantId, + waiters: Vec, + ) { + // Wait for initial reconcile of child shards, this creates the secondary locations + if let Err(e) = self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + // This is not a failure to split: it's some issue reconciling the new child shards, perhaps + // their secondaries couldn't be attached. + tracing::warn!("Failed to reconcile after split: {e}"); + return; + } + + // Take the state lock to discover the attached & secondary intents for all shards + let (attached, secondary) = { + let locked = self.inner.read().unwrap(); + let mut attached = Vec::new(); + let mut secondary = Vec::new(); + + for (tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + let Some(node_id) = shard.intent.get_attached() else { + // Unexpected. Race with a PlacementPolicy change? + tracing::warn!( + "No attached node on {tenant_shard_id} immediately after shard split!" + ); + continue; + }; + + let Some(secondary_node_id) = shard.intent.get_secondary().first() else { + // No secondary location. Nothing for us to do. 
+ continue; + }; + + let attached_node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + + let secondary_node = locked + .nodes + .get(secondary_node_id) + .expect("Pageservers may not be deleted while referenced"); + + attached.push((*tenant_shard_id, attached_node.clone())); + secondary.push((*tenant_shard_id, secondary_node.clone())); + } + (attached, secondary) + }; + + if secondary.is_empty() { + // No secondary locations; nothing for us to do + return; + } + + for result in self + .tenant_for_shards_api( + attached, + |tenant_shard_id, client| async move { + client.tenant_heatmap_upload(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling heatmap upload after shard split: {e}"); + return; + } + } + + for result in self + .tenant_for_shards_api( + secondary, + |tenant_shard_id, client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::ZERO)) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + { + if let Err(e) = result { + tracing::warn!("Error calling secondary download after shard split: {e}"); + return; + } + } + } + + pub(crate) async fn tenant_shard_split( + &self, + tenant_id: TenantId, + split_req: TenantShardSplitRequest, + ) -> Result { + // TODO: return 503 if we get stuck waiting for this lock + // (issue https://github.com/neondatabase/neon/issues/7108) + let _tenant_lock = trace_exclusive_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::ShardSplit, + ) + .await; + + let new_shard_count = ShardCount::new(split_req.new_shard_count); + let new_stripe_size = split_req.new_stripe_size; + + // Validate the request and construct parameters. This phase is fallible, but does not require + // rollback on errors, as it does no I/O and mutates no state. 
+ let shard_split_params = match self.prepare_tenant_shard_split(tenant_id, split_req)? { + ShardSplitAction::NoOp(resp) => return Ok(resp), + ShardSplitAction::Split(params) => params, + }; + + // Execute this split: this phase mutates state and does remote I/O on pageservers. If it fails, + // we must roll back. + let r = self + .do_tenant_shard_split(tenant_id, shard_split_params) + .await; + + let (response, waiters) = match r { + Ok(r) => r, + Err(e) => { + // Split might be part-done, we must do work to abort it. + tracing::warn!("Enqueuing background abort of split on {tenant_id}"); + self.abort_tx + .send(TenantShardSplitAbort { + tenant_id, + new_shard_count, + new_stripe_size, + _tenant_lock, + }) + // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. + .ok(); + return Err(e); + } + }; + + // The split is now complete. As an optimization, we will trigger all the child shards to upload + // a heatmap immediately, and all their secondary locations to start downloading: this avoids waiting + // for the background heatmap/download interval before secondaries get warm enough to migrate shards + // in [`Self::optimize_all`] + self.tenant_shard_split_start_secondaries(tenant_id, waiters) + .await; + Ok(response) + } + + fn prepare_tenant_shard_split( + &self, + tenant_id: TenantId, + split_req: TenantShardSplitRequest, + ) -> Result { + fail::fail_point!("shard-split-validation", |_| Err(ApiError::BadRequest( + anyhow::anyhow!("failpoint") + ))); + + let mut policy = None; + let mut config = None; + let mut shard_ident = None; + // Validate input, and calculate which shards we will create + let (old_shard_count, targets) = + { + let locked = self.inner.read().unwrap(); + + let pageservers = locked.nodes.clone(); + + let mut targets = Vec::new(); + + // In case this is a retry, count how many already-split shards we found + let mut children_found = Vec::new(); + let mut old_shard_count = None; + + for 
(tenant_shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + match shard.shard.count.count().cmp(&split_req.new_shard_count) { + Ordering::Equal => { + // Already split this + children_found.push(*tenant_shard_id); + continue; + } + Ordering::Greater => { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Requested count {} but already have shards at count {}", + split_req.new_shard_count, + shard.shard.count.count() + ))); + } + Ordering::Less => { + // Fall through: this shard has lower count than requested, + // is a candidate for splitting. + } + } + + match old_shard_count { + None => old_shard_count = Some(shard.shard.count), + Some(old_shard_count) => { + if old_shard_count != shard.shard.count { + // We may hit this case if a caller asked for two splits to + // different sizes, before the first one is complete. + // e.g. 1->2, 2->4, where the 4 call comes while we have a mixture + // of shard_count=1 and shard_count=2 shards in the map. + return Err(ApiError::Conflict( + "Cannot split, currently mid-split".to_string(), + )); + } + } + } + if policy.is_none() { + policy = Some(shard.policy.clone()); + } + if shard_ident.is_none() { + shard_ident = Some(shard.shard); + } + if config.is_none() { + config = Some(shard.config.clone()); + } + + if tenant_shard_id.shard_count.count() == split_req.new_shard_count { + tracing::info!( + "Tenant shard {} already has shard count {}", + tenant_shard_id, + split_req.new_shard_count + ); + continue; + } + + let node_id = shard.intent.get_attached().ok_or(ApiError::BadRequest( + anyhow::anyhow!("Cannot split a tenant that is not attached"), + ))?; + + let node = pageservers + .get(&node_id) + .expect("Pageservers may not be deleted while referenced"); + + targets.push(ShardSplitTarget { + parent_id: *tenant_shard_id, + node: node.clone(), + child_ids: tenant_shard_id + .split(ShardCount::new(split_req.new_shard_count)), + }); + } + + if targets.is_empty() { + if children_found.len() 
== split_req.new_shard_count as usize { + return Ok(ShardSplitAction::NoOp(TenantShardSplitResponse { + new_shards: children_found, + })); + } else { + // No shards found to split, and no existing children found: the + // tenant doesn't exist at all. + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant {} not found", tenant_id).into(), + )); + } + } + + (old_shard_count, targets) + }; + + // unwrap safety: we would have returned above if we didn't find at least one shard to split + let old_shard_count = old_shard_count.unwrap(); + let shard_ident = if let Some(new_stripe_size) = split_req.new_stripe_size { + // This ShardIdentity will be used as the template for all children, so this implicitly + // applies the new stripe size to the children. + let mut shard_ident = shard_ident.unwrap(); + if shard_ident.count.count() > 1 && shard_ident.stripe_size != new_stripe_size { + return Err(ApiError::BadRequest(anyhow::anyhow!("Attempted to change stripe size ({:?}->{new_stripe_size:?}) on a tenant with multiple shards", shard_ident.stripe_size))); + } + + shard_ident.stripe_size = new_stripe_size; + tracing::info!("applied stripe size {}", shard_ident.stripe_size.0); + shard_ident + } else { + shard_ident.unwrap() + }; + let policy = policy.unwrap(); + let config = config.unwrap(); + + Ok(ShardSplitAction::Split(ShardSplitParams { + old_shard_count, + new_shard_count: ShardCount::new(split_req.new_shard_count), + new_stripe_size: split_req.new_stripe_size, + targets, + policy, + config, + shard_ident, + })) + } + + async fn do_tenant_shard_split( + &self, + tenant_id: TenantId, + params: ShardSplitParams, + ) -> Result<(TenantShardSplitResponse, Vec), ApiError> { + // FIXME: we have dropped self.inner lock, and not yet written anything to the database: another + // request could occur here, deleting or mutating the tenant. 
begin_shard_split checks that the + // parent shards exist as expected, but it would be neater to do the above pre-checks within the + // same database transaction rather than pre-check in-memory and then maybe-fail the database write. + // (https://github.com/neondatabase/neon/issues/6676) + + let ShardSplitParams { + old_shard_count, + new_shard_count, + new_stripe_size, + mut targets, + policy, + config, + shard_ident, + } = params; + + // Drop any secondary locations: pageservers do not support splitting these, and in any case the + // end-state for a split tenant will usually be to have secondary locations on different nodes. + // The reconciliation calls in this block also implicitly cancel+barrier wrt any ongoing reconciliation + // at the time of split. + let waiters = { + let mut locked = self.inner.write().unwrap(); + let mut waiters = Vec::new(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for target in &mut targets { + let Some(shard) = tenants.get_mut(&target.parent_id) else { + // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard {} not found", + target.parent_id + ))); + }; + + if shard.intent.get_attached() != &Some(target.node.get_id()) { + // Paranoia check: this shouldn't happen: we have the oplock for this tenant ID. + return Err(ApiError::Conflict(format!( + "Shard {} unexpectedly rescheduled during split", + target.parent_id + ))); + } + + // Irrespective of PlacementPolicy, clear secondary locations from intent + shard.intent.clear_secondary(scheduler); + + // Run Reconciler to execute detach fo secondary locations. 
+ if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + waiters.push(waiter); + } + } + waiters + }; + self.await_waiters(waiters, RECONCILE_TIMEOUT).await?; + + // Before creating any new child shards in memory or on the pageservers, persist them: this + // enables us to ensure that we will always be able to clean up if something goes wrong. This also + // acts as the protection against two concurrent attempts to split: one of them will get a database + // error trying to insert the child shards. + let mut child_tsps = Vec::new(); + for target in &targets { + let mut this_child_tsps = Vec::new(); + for child in &target.child_ids { + let mut child_shard = shard_ident; + child_shard.number = child.shard_number; + child_shard.count = child.shard_count; + + tracing::info!( + "Create child shard persistence with stripe size {}", + shard_ident.stripe_size.0 + ); + + this_child_tsps.push(TenantShardPersistence { + tenant_id: child.tenant_id.to_string(), + shard_number: child.shard_number.0 as i32, + shard_count: child.shard_count.literal() as i32, + shard_stripe_size: shard_ident.stripe_size.0 as i32, + // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will + // populate the correct generation as part of its transaction, to protect us + // against racing with changes in the state of the parent. 
+ generation: None, + generation_pageserver: Some(target.node.get_id().0 as i64), + placement_policy: serde_json::to_string(&policy).unwrap(), + config: serde_json::to_string(&config).unwrap(), + splitting: SplitState::Splitting, + + // Scheduling policies do not carry through to children + scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) + .unwrap(), + }); + } + + child_tsps.push((target.parent_id, this_child_tsps)); + } + + if let Err(e) = self + .persistence + .begin_shard_split(old_shard_count, tenant_id, child_tsps) + .await + { + match e { + DatabaseError::Query(diesel::result::Error::DatabaseError( + DatabaseErrorKind::UniqueViolation, + _, + )) => { + // Inserting a child shard violated a unique constraint: we raced with another call to + // this function + tracing::warn!("Conflicting attempt to split {tenant_id}: {e}"); + return Err(ApiError::Conflict("Tenant is already splitting".into())); + } + _ => return Err(ApiError::InternalServerError(e.into())), + } + } + fail::fail_point!("shard-split-post-begin", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); + + // Now that I have persisted the splitting state, apply it in-memory. This is infallible, so + // callers may assume that if splitting is set in memory, then it was persisted, and if splitting + // is not set in memory, then it was not persisted. + { + let mut locked = self.inner.write().unwrap(); + for target in &targets { + if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) { + parent_shard.splitting = SplitState::Splitting; + // Put the observed state to None, to reflect that it is indeterminate once we start the + // split operation. + parent_shard + .observed + .locations + .insert(target.node.get_id(), ObservedStateLocation { conf: None }); + } + } + } + + // TODO: issue split calls concurrently (this only matters once we're splitting + // N>1 shards into M shards -- initially we're usually splitting 1 shard into N). 
+ + for target in &targets { + let ShardSplitTarget { + parent_id, + node, + child_ids, + } = target; + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + let response = client + .tenant_shard_split( + *parent_id, + TenantShardSplitRequest { + new_shard_count: new_shard_count.literal(), + new_stripe_size, + }, + ) + .await + .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?; + + fail::fail_point!("shard-split-post-remote", |_| Err(ApiError::Conflict( + "failpoint".to_string() + ))); + + tracing::info!( + "Split {} into {}", + parent_id, + response + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + + if &response.new_shards != child_ids { + // This should never happen: the pageserver should agree with us on how shard splits work. + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Splitting shard {} resulted in unexpected IDs: {:?} (expected {:?})", + parent_id, + response.new_shards, + child_ids + ))); + } + } + + // TODO: if the pageserver restarted concurrently with our split API call, + // the actual generation of the child shard might differ from the generation + // we expect it to have. In order for our in-database generation to end up + // correct, we should carry the child generation back in the response and apply it here + // in complete_shard_split (and apply the correct generation in memory) + // (or, we can carry generation in the request and reject the request if + // it doesn't match, but that requires more retry logic on this side) + + self.persistence + .complete_shard_split(tenant_id, old_shard_count) + .await?; + + fail::fail_point!("shard-split-post-complete", |_| Err( + ApiError::InternalServerError(anyhow::anyhow!("failpoint")) + )); + + // Replace all the shards we just split with their children: this phase is infallible. 
+ let (response, child_locations, waiters) = + self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); + + // Send compute notifications for all the new shards + let mut failed_notifications = Vec::new(); + for (child_id, child_ps, stripe_size) in child_locations { + if let Err(e) = self + .compute_hook + .notify(child_id, child_ps, stripe_size, &self.cancel) + .await + { + tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})", + child_id, child_ps); + failed_notifications.push(child_id); + } + } + + // If we failed any compute notifications, make a note to retry later. + if !failed_notifications.is_empty() { + let mut locked = self.inner.write().unwrap(); + for failed in failed_notifications { + if let Some(shard) = locked.tenants.get_mut(&failed) { + shard.pending_compute_notification = true; + } + } + } + + Ok((response, waiters)) + } + + pub(crate) async fn tenant_shard_migrate( + &self, + tenant_shard_id: TenantShardId, + migrate_req: TenantShardMigrateRequest, + ) -> Result { + let waiter = { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let Some(node) = nodes.get(&migrate_req.node_id) else { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Node {} not found", + migrate_req.node_id + ))); + }; + + if !node.is_available() { + // Warn but proceed: the caller may intend to manually adjust the placement of + // a shard even if the node is down, e.g. if intervening during an incident. + tracing::warn!("Migrating to unavailable node {node}"); + } + + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + if shard.intent.get_attached() == &Some(migrate_req.node_id) { + // No-op case: we will still proceed to wait for reconciliation in case it is + // incomplete from an earlier update to the intent. 
+ tracing::info!("Migrating: intent is unchanged {:?}", shard.intent); + } else { + let old_attached = *shard.intent.get_attached(); + + match shard.policy { + PlacementPolicy::Attached(n) => { + // If our new attached node was a secondary, it no longer should be. + shard.intent.remove_secondary(scheduler, migrate_req.node_id); + + // If we were already attached to something, demote that to a secondary + if let Some(old_attached) = old_attached { + if n > 0 { + // Remove other secondaries to make room for the location we'll demote + while shard.intent.get_secondary().len() >= n { + shard.intent.pop_secondary(scheduler); + } + + shard.intent.push_secondary(scheduler, old_attached); + } + } + + shard.intent.set_attached(scheduler, Some(migrate_req.node_id)); + } + PlacementPolicy::Secondary => { + shard.intent.clear(scheduler); + shard.intent.push_secondary(scheduler, migrate_req.node_id); + } + PlacementPolicy::Detached => { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first" + ))) + } + } + + tracing::info!("Migrating: new intent {:?}", shard.intent); + shard.sequence = shard.sequence.next(); + } + + self.maybe_reconcile_shard(shard, nodes) + }; + + if let Some(waiter) = waiter { + waiter.wait_timeout(RECONCILE_TIMEOUT).await?; + } else { + tracing::info!("Migration is a no-op"); + } + + Ok(TenantShardMigrateResponse {}) + } + + /// This is for debug/support only: we simply drop all state for a tenant, without + /// detaching or deleting it on pageservers. 
+ pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> { + self.persistence.delete_tenant(tenant_id).await?; + + let mut locked = self.inner.write().unwrap(); + let (_nodes, tenants, scheduler) = locked.parts_mut(); + let mut shards = Vec::new(); + for (tenant_shard_id, _) in tenants.range(TenantShardId::tenant_range(tenant_id)) { + shards.push(*tenant_shard_id); + } + + for shard_id in shards { + if let Some(mut shard) = tenants.remove(&shard_id) { + shard.intent.clear(scheduler); + } + } + + Ok(()) + } + + /// This is for debug/support only: assuming tenant data is already present in S3, we "create" a + /// tenant with a very high generation number so that it will see the existing data. + pub(crate) async fn tenant_import( + &self, + tenant_id: TenantId, + ) -> Result { + // Pick an arbitrary available pageserver to use for scanning the tenant in remote storage + let maybe_node = { + self.inner + .read() + .unwrap() + .nodes + .values() + .find(|n| n.is_available()) + .cloned() + }; + let Some(node) = maybe_node else { + return Err(ApiError::BadRequest(anyhow::anyhow!("No nodes available"))); + }; + + let client = PageserverClient::new( + node.get_id(), + node.base_url(), + self.config.jwt_token.as_deref(), + ); + + let scan_result = client + .tenant_scan_remote_storage(tenant_id) + .await + .map_err(|e| passthrough_api_error(&node, e))?; + + // A post-split tenant may contain a mixture of shard counts in remote storage: pick the highest count. 
+ let Some(shard_count) = scan_result + .shards + .iter() + .map(|s| s.tenant_shard_id.shard_count) + .max() + else { + return Err(ApiError::NotFound( + anyhow::anyhow!("No shards found").into(), + )); + }; + + // Ideally we would set each newly imported shard's generation independently, but for correctness it is sufficient + // to + let generation = scan_result + .shards + .iter() + .map(|s| s.generation) + .max() + .expect("We already validated >0 shards"); + + // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will + // only work if they were using the default stripe size. + let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE; + + let (response, waiters) = self + .do_tenant_create(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation, + + shard_parameters: ShardParameters { + count: shard_count, + stripe_size, + }, + placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking + + // There is no way to know what the tenant's config was: revert to defaults + config: TenantConfig::default(), + }) + .await?; + + if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await { + // Since this is a debug/support operation, all kinds of weird issues are possible (e.g. this + // tenant doesn't exist in the control plane), so don't fail the request if it can't fully + // reconcile, as reconciliation includes notifying compute. + tracing::warn!(%tenant_id, "Reconcile not done yet while importing tenant ({e})"); + } + + Ok(response) + } + + /// For debug/support: a full JSON dump of TenantShards. Returns a response so that + /// we don't have to make TenantShard clonable in the return path. 
+ pub(crate) fn tenants_dump(&self) -> Result, ApiError> { + let serialized = { + let locked = self.inner.read().unwrap(); + let result = locked.tenants.values().collect::>(); + serde_json::to_string(&result).map_err(|e| ApiError::InternalServerError(e.into()))? + }; + + hyper::Response::builder() + .status(hyper::StatusCode::OK) + .header(hyper::header::CONTENT_TYPE, "application/json") + .body(hyper::Body::from(serialized)) + .map_err(|e| ApiError::InternalServerError(e.into())) + } + + /// Check the consistency of in-memory state vs. persistent state, and check that the + /// scheduler's statistics are up to date. + /// + /// These consistency checks expect an **idle** system. If changes are going on while + /// we run, then we can falsely indicate a consistency issue. This is sufficient for end-of-test + /// checks, but not suitable for running continuously in the background in the field. + pub(crate) async fn consistency_check(&self) -> Result<(), ApiError> { + let (mut expect_nodes, mut expect_shards) = { + let locked = self.inner.read().unwrap(); + + locked + .scheduler + .consistency_check(locked.nodes.values(), locked.tenants.values()) + .context("Scheduler checks") + .map_err(ApiError::InternalServerError)?; + + let expect_nodes = locked + .nodes + .values() + .map(|n| n.to_persistent()) + .collect::>(); + + let expect_shards = locked + .tenants + .values() + .map(|t| t.to_persistent()) + .collect::>(); + + // This method can only validate the state of an idle system: if a reconcile is in + // progress, fail out early to avoid giving false errors on state that won't match + // between database and memory under a ReconcileResult is processed. 
+ for t in locked.tenants.values() { + if t.reconciler.is_some() { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard {} reconciliation in progress", + t.tenant_shard_id + ))); + } + } + + (expect_nodes, expect_shards) + }; + + let mut nodes = self.persistence.list_nodes().await?; + expect_nodes.sort_by_key(|n| n.node_id); + nodes.sort_by_key(|n| n.node_id); + + if nodes != expect_nodes { + tracing::error!("Consistency check failed on nodes."); + tracing::error!( + "Nodes in memory: {}", + serde_json::to_string(&expect_nodes) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + tracing::error!( + "Nodes in database: {}", + serde_json::to_string(&nodes) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Node consistency failure" + ))); + } + + let mut shards = self.persistence.list_tenant_shards().await?; + shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count)); + + if shards != expect_shards { + tracing::error!("Consistency check failed on shards."); + tracing::error!( + "Shards in memory: {}", + serde_json::to_string(&expect_shards) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + tracing::error!( + "Shards in database: {}", + serde_json::to_string(&shards) + .map_err(|e| ApiError::InternalServerError(e.into()))? + ); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Shard consistency failure" + ))); + } + + Ok(()) + } + + /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that + /// we don't have to make TenantShard clonable in the return path. + pub(crate) fn scheduler_dump(&self) -> Result, ApiError> { + let serialized = { + let locked = self.inner.read().unwrap(); + serde_json::to_string(&locked.scheduler) + .map_err(|e| ApiError::InternalServerError(e.into()))? 
+ }; + + hyper::Response::builder() + .status(hyper::StatusCode::OK) + .header(hyper::header::CONTENT_TYPE, "application/json") + .body(hyper::Body::from(serialized)) + .map_err(|e| ApiError::InternalServerError(e.into())) + } + + /// This is for debug/support only: we simply drop all state for a tenant, without + /// detaching or deleting it on pageservers. We do not try and re-schedule any + /// tenants that were on this node. + /// + /// TODO: proper node deletion API that unhooks things more gracefully + pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> { + self.persistence.delete_node(node_id).await?; + + let mut locked = self.inner.write().unwrap(); + + for shard in locked.tenants.values_mut() { + shard.deref_node(node_id); + } + + let mut nodes = (*locked.nodes).clone(); + nodes.remove(&node_id); + locked.nodes = Arc::new(nodes); + + locked.scheduler.node_remove(node_id); + + Ok(()) + } + + pub(crate) async fn node_list(&self) -> Result, ApiError> { + let nodes = { + self.inner + .read() + .unwrap() + .nodes + .values() + .cloned() + .collect::>() + }; + + Ok(nodes) + } + + pub(crate) async fn get_node(&self, node_id: NodeId) -> Result { + self.inner + .read() + .unwrap() + .nodes + .get(&node_id) + .cloned() + .ok_or(ApiError::NotFound( + format!("Node {node_id} not registered").into(), + )) + } + + pub(crate) async fn node_register( + &self, + register_req: NodeRegisterRequest, + ) -> Result<(), ApiError> { + let _node_lock = trace_exclusive_lock( + &self.node_op_locks, + register_req.node_id, + NodeOperations::Register, + ) + .await; + + { + let locked = self.inner.read().unwrap(); + if let Some(node) = locked.nodes.get(®ister_req.node_id) { + // Note that we do not do a total equality of the struct, because we don't require + // the availability/scheduling states to agree for a POST to be idempotent. 
+ if node.registration_match(®ister_req) { + tracing::info!( + "Node {} re-registered with matching address", + register_req.node_id + ); + return Ok(()); + } else { + // TODO: decide if we want to allow modifying node addresses without removing and re-adding + // the node. Safest/simplest thing is to refuse it, and usually we deploy with + // a fixed address through the lifetime of a node. + tracing::warn!( + "Node {} tried to register with different address", + register_req.node_id + ); + return Err(ApiError::Conflict( + "Node is already registered with different address".to_string(), + )); + } + } + } + + // We do not require that a node is actually online when registered (it will start life + // with it's availability set to Offline), but we _do_ require that its DNS record exists. We're + // therefore not immune to asymmetric L3 connectivity issues, but we are protected against nodes + // that register themselves with a broken DNS config. We check only the HTTP hostname, because + // the postgres hostname might only be resolvable to clients (e.g. if we're on a different VPC than clients). + if tokio::net::lookup_host(format!( + "{}:{}", + register_req.listen_http_addr, register_req.listen_http_port + )) + .await + .is_err() + { + // If we have a transient DNS issue, it's up to the caller to retry their registration. Because + // we can't robustly distinguish between an intermittent issue and a totally bogus DNS situation, + // we return a soft 503 error, to encourage callers to retry past transient issues. + return Err(ApiError::ResourceUnavailable( + format!( + "Node {} tried to register with unknown DNS name '{}'", + register_req.node_id, register_req.listen_http_addr + ) + .into(), + )); + } + + // Ordering: we must persist the new node _before_ adding it to in-memory state. + // This ensures that before we use it for anything or expose it via any external + // API, it is guaranteed to be available after a restart. 
+ let new_node = Node::new( + register_req.node_id, + register_req.listen_http_addr, + register_req.listen_http_port, + register_req.listen_pg_addr, + register_req.listen_pg_port, + ); + + // TODO: idempotency if the node already exists in the database + self.persistence.insert_node(&new_node).await?; + + let mut locked = self.inner.write().unwrap(); + let mut new_nodes = (*locked.nodes).clone(); + + locked.scheduler.node_upsert(&new_node); + new_nodes.insert(register_req.node_id, new_node); + + locked.nodes = Arc::new(new_nodes); + + tracing::info!( + "Registered pageserver {}, now have {} pageservers", + register_req.node_id, + locked.nodes.len() + ); + Ok(()) + } + + pub(crate) async fn node_configure( + &self, + node_id: NodeId, + availability: Option, + scheduling: Option, + ) -> Result<(), ApiError> { + let _node_lock = + trace_exclusive_lock(&self.node_op_locks, node_id, NodeOperations::Configure).await; + + if let Some(scheduling) = scheduling { + // Scheduling is a persistent part of Node: we must write updates to the database before + // applying them in memory + self.persistence.update_node(node_id, scheduling).await?; + } + + // If we're activating a node, then before setting it active we must reconcile any shard locations + // on that node, in case it is out of sync, e.g. due to being unavailable during controller startup, + // by calling [`Self::node_activate_reconcile`] + // + // The transition we calculate here remains valid later in the function because we hold the op lock on the node: + // nothing else can mutate its availability while we run. 
+ let availability_transition = if let Some(input_availability) = availability { + let (activate_node, availability_transition) = { + let locked = self.inner.read().unwrap(); + let Some(node) = locked.nodes.get(&node_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + )); + }; + + ( + node.clone(), + node.get_availability_transition(input_availability), + ) + }; + + if matches!(availability_transition, AvailabilityTransition::ToActive) { + self.node_activate_reconcile(activate_node, &_node_lock) + .await?; + } + availability_transition + } else { + AvailabilityTransition::Unchanged + }; + + // Apply changes from the request to our in-memory state for the Node + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let mut new_nodes = (**nodes).clone(); + + let Some(node) = new_nodes.get_mut(&node_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Node not registered").into(), + )); + }; + + if let Some(availability) = &availability { + node.set_availability(*availability); + } + + if let Some(scheduling) = scheduling { + node.set_scheduling(scheduling); + } + + // Update the scheduler, in case the elegibility of the node for new shards has changed + scheduler.node_upsert(node); + + let new_nodes = Arc::new(new_nodes); + + // Modify scheduling state for any Tenants that are affected by a change in the node's availability state. 
+ match availability_transition { + AvailabilityTransition::ToOffline => { + tracing::info!("Node {} transition to offline", node_id); + let mut tenants_affected: usize = 0; + + for (tenant_shard_id, tenant_shard) in tenants { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { + // When a node goes offline, we set its observed configuration to None, indicating unknown: we will + // not assume our knowledge of the node's configuration is accurate until it comes back online + observed_loc.conf = None; + } + + if new_nodes.len() == 1 { + // Special case for single-node cluster: there is no point trying to reschedule + // any tenant shards: avoid doing so, in order to avoid spewing warnings about + // failures to schedule them. + continue; + } + + if !new_nodes + .values() + .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) + { + // Special case for when all nodes are unavailable and/or unschedulable: there is no point + // trying to reschedule since there's nowhere else to go. Without this + // branch we incorrectly detach tenants in response to node unavailability. + continue; + } + + if tenant_shard.intent.demote_attached(scheduler, node_id) { + tenant_shard.sequence = tenant_shard.sequence.next(); + + // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters + // for tenants without secondary locations: if they have a secondary location, then this + // schedule() call is just promoting an existing secondary) + let mut schedule_context = ScheduleContext::default(); + + match tenant_shard.schedule(scheduler, &mut schedule_context) { + Err(e) => { + // It is possible that some tenants will become unschedulable when too many pageservers + // go offline: in this case there isn't much we can do other than make the issue observable. + // TODO: give TenantShard a scheduling error attribute to be queried later. 
+ tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); + } + Ok(()) => { + if self + .maybe_reconcile_shard(tenant_shard, &new_nodes) + .is_some() + { + tenants_affected += 1; + }; + } + } + } + } + tracing::info!( + "Launched {} reconciler tasks for tenants affected by node {} going offline", + tenants_affected, + node_id + ) + } + AvailabilityTransition::ToActive => { + tracing::info!("Node {} transition to active", node_id); + // When a node comes back online, we must reconcile any tenant that has a None observed + // location on the node. + for tenant_shard in locked.tenants.values_mut() { + // If a reconciliation is already in progress, rely on the previous scheduling + // decision and skip triggering a new reconciliation. + if tenant_shard.reconciler.is_some() { + continue; + } + + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { + if observed_loc.conf.is_none() { + self.maybe_reconcile_shard(tenant_shard, &new_nodes); + } + } + } + + // TODO: in the background, we should balance work back onto this pageserver + } + AvailabilityTransition::Unchanged => { + tracing::debug!("Node {} no availability change during config", node_id); + } + } + + locked.nodes = new_nodes; + + Ok(()) + } + + pub(crate) async fn start_node_drain( + self: &Arc, + node_id: NodeId, + ) -> Result<(), ApiError> { + let (ongoing_op, node_available, node_policy, schedulable_nodes_count) = { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + let node = nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + let schedulable_nodes_count = nodes + .iter() + .filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_))) + .count(); + + ( + locked + .ongoing_operation + .as_ref() + .map(|ongoing| ongoing.operation), + node.is_available(), + node.get_scheduling(), + schedulable_nodes_count, + ) + }; + + if let Some(ongoing) = 
ongoing_op { + return Err(ApiError::PreconditionFailed( + format!("Background operation already ongoing for node: {}", ongoing).into(), + )); + } + + if !node_available { + return Err(ApiError::ResourceUnavailable( + format!("Node {node_id} is currently unavailable").into(), + )); + } + + if schedulable_nodes_count == 0 { + return Err(ApiError::PreconditionFailed( + "No other schedulable nodes to drain to".into(), + )); + } + + match node_policy { + NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => { + self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining)) + .await?; + + let cancel = CancellationToken::new(); + + self.inner.write().unwrap().ongoing_operation = Some(OperationHandler { + operation: Operation::Drain(Drain { node_id }), + cancel: cancel.clone(), + }); + + tokio::task::spawn({ + let service = self.clone(); + let cancel = cancel.clone(); + async move { + scopeguard::defer! { + let prev = service.inner.write().unwrap().ongoing_operation.take(); + + if let Some(Operation::Drain(removed_drain)) = prev.map(|h| h.operation) { + assert_eq!(removed_drain.node_id, node_id, "We always take the same operation"); + } else { + panic!("We always remove the same operation") + } + } + + tracing::info!(%node_id, "Drain background operation starting"); + let res = service.drain_node(node_id, cancel).await; + match res { + Ok(()) => { + tracing::info!(%node_id, "Drain background operation completed successfully"); + } + Err(OperationError::Cancelled) => { + tracing::info!(%node_id, "Drain background operation was cancelled"); + } + Err(err) => { + tracing::error!(%node_id, "Drain background operation encountered: {err}") + } + } + } + }); + } + NodeSchedulingPolicy::Draining => { + return Err(ApiError::Conflict(format!( + "Node {node_id} has drain in progress" + ))); + } + policy => { + return Err(ApiError::PreconditionFailed( + format!("Node {node_id} cannot be drained due to {policy:?} policy").into(), + )); + } + } + + Ok(()) + } + + 
pub(crate) async fn start_node_fill(self: &Arc, node_id: NodeId) -> Result<(), ApiError> { + let (ongoing_op, node_available, node_policy, total_nodes_count) = { + let locked = self.inner.read().unwrap(); + let nodes = &locked.nodes; + let node = nodes.get(&node_id).ok_or(ApiError::NotFound( + anyhow::anyhow!("Node {} not registered", node_id).into(), + ))?; + + ( + locked + .ongoing_operation + .as_ref() + .map(|ongoing| ongoing.operation), + node.is_available(), + node.get_scheduling(), + nodes.len(), + ) + }; + + if let Some(ongoing) = ongoing_op { + return Err(ApiError::PreconditionFailed( + format!("Background operation already ongoing for node: {}", ongoing).into(), + )); + } + + if !node_available { + return Err(ApiError::ResourceUnavailable( + format!("Node {node_id} is currently unavailable").into(), + )); + } + + if total_nodes_count <= 1 { + return Err(ApiError::PreconditionFailed( + "No other nodes to fill from".into(), + )); + } + + match node_policy { + NodeSchedulingPolicy::Active => { + self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling)) + .await?; + + let cancel = CancellationToken::new(); + + self.inner.write().unwrap().ongoing_operation = Some(OperationHandler { + operation: Operation::Fill(Fill { node_id }), + cancel: cancel.clone(), + }); + + tokio::task::spawn({ + let service = self.clone(); + let cancel = cancel.clone(); + async move { + scopeguard::defer! 
{ + let prev = service.inner.write().unwrap().ongoing_operation.take(); + + if let Some(Operation::Fill(removed_fill)) = prev.map(|h| h.operation) { + assert_eq!(removed_fill.node_id, node_id, "We always take the same operation"); + } else { + panic!("We always remove the same operation") + } + } + + tracing::info!(%node_id, "Fill background operation starting"); + let res = service.fill_node(node_id, cancel).await; + match res { + Ok(()) => { + tracing::info!(%node_id, "Fill background operation completed successfully"); + } + Err(OperationError::Cancelled) => { + tracing::info!(%node_id, "Fill background operation was cancelled"); + } + Err(err) => { + tracing::error!(%node_id, "Fill background operation encountered: {err}") + } + } + } + }); + } + NodeSchedulingPolicy::Filling => { + return Err(ApiError::Conflict(format!( + "Node {node_id} has fill in progress" + ))); + } + policy => { + return Err(ApiError::PreconditionFailed( + format!("Node {node_id} cannot be filled due to {policy:?} policy").into(), + )); + } + } + + Ok(()) + } + + /// Helper for methods that will try and call pageserver APIs for + /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant + /// is attached somewhere. + fn ensure_attached_schedule( + &self, + mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, + tenant_id: TenantId, + ) -> Result, anyhow::Error> { + let mut waiters = Vec::new(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let mut schedule_context = ScheduleContext::default(); + for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { + shard.schedule(scheduler, &mut schedule_context)?; + + // The shard's policies may not result in an attached location being scheduled: this + // is an error because our caller needs it attached somewhere. 
+ if shard.intent.get_attached().is_none() { + return Err(anyhow::anyhow!( + "Tenant {tenant_id} not scheduled to be attached" + )); + }; + + if shard.stably_attached().is_some() { + // We do not require the shard to be totally up to date on reconciliation: we just require + // that it has been attached on the intended node. Other dirty state such as unattached secondary + // locations, or compute hook notifications can be ignored. + continue; + } + + if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { + tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached"); + waiters.push(waiter); + } + } + Ok(waiters) + } + + async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> { + let ensure_waiters = { + let locked = self.inner.write().unwrap(); + + // Check if the tenant is splitting: in this case, even if it is attached, + // we must act as if it is not: this blocks e.g. timeline creation/deletion + // operations during the split. + for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { + if !matches!(shard.splitting, SplitState::Idle) { + return Err(ApiError::ResourceUnavailable( + "Tenant shards are currently splitting".into(), + )); + } + } + + self.ensure_attached_schedule(locked, tenant_id) + .map_err(ApiError::InternalServerError)? 
+ }; + + let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); + for waiter in ensure_waiters { + let timeout = deadline.duration_since(Instant::now()); + waiter.wait_timeout(timeout).await?; + } + + Ok(()) + } + + /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`], + fn maybe_reconcile_shard( + &self, + shard: &mut TenantShard, + nodes: &Arc>, + ) -> Option { + let reconcile_needed = shard.get_reconcile_needed(nodes); + + match reconcile_needed { + ReconcileNeeded::No => return None, + ReconcileNeeded::WaitExisting(waiter) => return Some(waiter), + ReconcileNeeded::Yes => { + // Fall through to try and acquire units for spawning reconciler + } + }; + + let units = match self.reconciler_concurrency.clone().try_acquire_owned() { + Ok(u) => ReconcileUnits::new(u), + Err(_) => { + tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(), + "Concurrency limited: enqueued for reconcile later"); + if !shard.delayed_reconcile { + match self.delayed_reconcile_tx.try_send(shard.tenant_shard_id) { + Err(TrySendError::Closed(_)) => { + // Weird mid-shutdown case? + } + Err(TrySendError::Full(_)) => { + // It is safe to skip sending our ID in the channel: we will eventually get retried by the background reconcile task. + tracing::warn!( + "Many shards are waiting to reconcile: delayed_reconcile queue is full" + ); + } + Ok(()) => { + shard.delayed_reconcile = true; + } + } + } + + // We won't spawn a reconciler, but we will construct a waiter that waits for the shard's sequence + // number to advance. When this function is eventually called again and succeeds in getting units, + // it will spawn a reconciler that makes this waiter complete. + return Some(shard.future_reconcile_waiter()); + } + }; + + let Ok(gate_guard) = self.gate.enter() else { + // Gate closed: we're shutting down, drop out. 
+ return None; + }; + + shard.spawn_reconciler( + &self.result_tx, + nodes, + &self.compute_hook, + &self.config, + &self.persistence, + units, + gate_guard, + &self.cancel, + ) + } + + /// Check all tenants for pending reconciliation work, and reconcile those in need. + /// Additionally, reschedule tenants that require it. + /// + /// Returns how many reconciliation tasks were started, or `1` if no reconciles were + /// spawned but some _would_ have been spawned if `reconciler_concurrency` units where + /// available. A return value of 0 indicates that everything is fully reconciled already. + fn reconcile_all(&self) -> usize { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, _scheduler) = locked.parts_mut(); + let pageservers = nodes.clone(); + + let mut schedule_context = ScheduleContext::default(); + + let mut reconciles_spawned = 0; + for (tenant_shard_id, shard) in tenants.iter_mut() { + if tenant_shard_id.is_shard_zero() { + schedule_context = ScheduleContext::default(); + } + + // Skip checking if this shard is already enqueued for reconciliation + if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { + // If there is something delayed, then return a nonzero count so that + // callers like reconcile_all_now do not incorrectly get the impression + // that the system is in a quiescent state. 
+ reconciles_spawned = std::cmp::max(1, reconciles_spawned); + continue; + } + + // Eventual consistency: if an earlier reconcile job failed, and the shard is still + // dirty, spawn another rone + if self.maybe_reconcile_shard(shard, &pageservers).is_some() { + reconciles_spawned += 1; + } + + schedule_context.avoid(&shard.intent.all_pageservers()); + } + + reconciles_spawned + } + + /// `optimize` in this context means identifying shards which have valid scheduled locations, but + /// could be scheduled somewhere better: + /// - Cutting over to a secondary if the node with the secondary is more lightly loaded + /// * e.g. after a node fails then recovers, to move some work back to it + /// - Cutting over to a secondary if it improves the spread of shard attachments within a tenant + /// * e.g. after a shard split, the initial attached locations will all be on the node where + /// we did the split, but are probably better placed elsewhere. + /// - Creating new secondary locations if it improves the spreading of a sharded tenant + /// * e.g. after a shard split, some locations will be on the same node (where the split + /// happened), and will probably be better placed elsewhere. + /// + /// To put it more briefly: whereas the scheduler respects soft constraints in a ScheduleContext at + /// the time of scheduling, this function looks for cases where a better-scoring location is available + /// according to those same soft constraints. + async fn optimize_all(&self) -> usize { + // Limit on how many shards' optmizations each call to this function will execute. Combined + // with the frequency of background calls, this acts as an implicit rate limit that runs a small + // trickle of optimizations in the background, rather than executing a large number in parallel + // when a change occurs. 
+ const MAX_OPTIMIZATIONS_EXEC_PER_PASS: usize = 2; + + // Synchronous prepare: scan shards for possible scheduling optimizations + let candidate_work = self.optimize_all_plan(); + let candidate_work_len = candidate_work.len(); + + // Asynchronous validate: I/O to pageservers to make sure shards are in a good state to apply validation + let validated_work = self.optimize_all_validate(candidate_work).await; + + let was_work_filtered = validated_work.len() != candidate_work_len; + + // Synchronous apply: update the shards' intent states according to validated optimisations + let mut reconciles_spawned = 0; + let mut optimizations_applied = 0; + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, optimization) in validated_work { + let Some(shard) = tenants.get_mut(&tenant_shard_id) else { + // Shard was dropped between planning and execution; + continue; + }; + if shard.apply_optimization(scheduler, optimization) { + optimizations_applied += 1; + if self.maybe_reconcile_shard(shard, nodes).is_some() { + reconciles_spawned += 1; + } + } + + if optimizations_applied >= MAX_OPTIMIZATIONS_EXEC_PER_PASS { + break; + } + } + + if was_work_filtered { + // If we filtered any work out during validation, ensure we return a nonzero value to indicate + // to callers that the system is not in a truly quiet state, it's going to do some work as soon + // as these validations start passing. 
+ reconciles_spawned = std::cmp::max(reconciles_spawned, 1); + } + + reconciles_spawned + } + + fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> { + let mut schedule_context = ScheduleContext::default(); + + let mut tenant_shards: Vec<&TenantShard> = Vec::new(); + + // How many candidate optimizations we will generate, before evaluating them for readniess: setting + // this higher than the execution limit gives us a chance to execute some work even if the first + // few optimizations we find are not ready. + const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; + + let mut work = Vec::new(); + + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + for (tenant_shard_id, shard) in tenants.iter() { + if tenant_shard_id.is_shard_zero() { + // Reset accumulators on the first shard in a tenant + schedule_context = ScheduleContext::default(); + schedule_context.mode = ScheduleMode::Speculative; + tenant_shards.clear(); + } + + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { + break; + } + + match shard.get_scheduling_policy() { + ShardSchedulingPolicy::Active => { + // Ok to do optimization + } + ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause + | ShardSchedulingPolicy::Stop => { + // Policy prevents optimizing this shard. + continue; + } + } + + // Accumulate the schedule context for all the shards in a tenant: we must have + // the total view of all shards before we can try to optimize any of them. 
+ schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + tenant_shards.push(shard); + + // Once we have seen the last shard in the tenant, proceed to search across all shards + // in the tenant for optimizations + if shard.shard.number.0 == shard.shard.count.count() - 1 { + if tenant_shards.iter().any(|s| s.reconciler.is_some()) { + // Do not start any optimizations while another change to the tenant is ongoing: this + // is not necessary for correctness, but simplifies operations and implicitly throttles + // optimization changes to happen in a "trickle" over time. + continue; + } + + if tenant_shards.iter().any(|s| { + !matches!(s.splitting, SplitState::Idle) + || matches!(s.policy, PlacementPolicy::Detached) + }) { + // Never attempt to optimize a tenant that is currently being split, or + // a tenant that is meant to be detached + continue; + } + + // TODO: optimization calculations are relatively expensive: create some fast-path for + // the common idle case (avoiding the search on tenants that we have recently checked) + + for shard in &tenant_shards { + if let Some(optimization) = + // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // its primary location based on soft constraints, cut it over. + shard.optimize_attachment(nodes, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } else if let Some(optimization) = + // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be + // better placed on another node, based on ScheduleContext, then adjust it. This + // covers cases like after a shard split, where we might have too many shards + // in the same tenant with secondary locations on the node where they originally split. 
+ shard.optimize_secondary(scheduler, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } + + // TODO: extend this mechanism to prefer attaching on nodes with fewer attached + // tenants (i.e. extend schedule state to distinguish attached from secondary counts), + // for the total number of attachments on a node (not just within a tenant.) + } + } + } + + work + } + + async fn optimize_all_validate( + &self, + candidate_work: Vec<(TenantShardId, ScheduleOptimization)>, + ) -> Vec<(TenantShardId, ScheduleOptimization)> { + // Take a clone of the node map to use outside the lock in async validation phase + let validation_nodes = { self.inner.read().unwrap().nodes.clone() }; + + let mut want_secondary_status = Vec::new(); + + // Validate our plans: this is an async phase where we may do I/O to pageservers to + // check that the state of locations is acceptable to run the optimization, such as + // checking that a secondary location is sufficiently warmed-up to cleanly cut over + // in a live migration. + let mut validated_work = Vec::new(); + for (tenant_shard_id, optimization) in candidate_work { + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: _, + new_attached_node_id, + }) => { + match validation_nodes.get(&new_attached_node_id) { + None => { + // Node was dropped between planning and validation + } + Some(node) => { + if !node.is_available() { + tracing::info!("Skipping optimization migration of {tenant_shard_id} to {new_attached_node_id} because node unavailable"); + } else { + // Accumulate optimizations that require fetching secondary status, so that we can execute these + // remote API requests concurrently. 
+ want_secondary_status.push(( + tenant_shard_id, + node.clone(), + optimization, + )); + } + } + } + } + ScheduleOptimizationAction::ReplaceSecondary(_) => { + // No extra checks needed to replace a secondary: this does not interrupt client access + validated_work.push((tenant_shard_id, optimization)) + } + }; + } + + // Call into pageserver API to find out if the destination secondary location is warm enough for a reasonably smooth migration: we + // do this so that we avoid spawning a Reconciler that would have to wait minutes/hours for a destination to warm up: that reconciler + // would hold a precious reconcile semaphore unit the whole time it was waiting for the destination to warm up. + let results = self + .tenant_for_shards_api( + want_secondary_status + .iter() + .map(|i| (i.0, i.1.clone())) + .collect(), + |tenant_shard_id, client| async move { + client.tenant_secondary_status(tenant_shard_id).await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + for ((tenant_shard_id, node, optimization), secondary_status) in + want_secondary_status.into_iter().zip(results.into_iter()) + { + match secondary_status { + Err(e) => { + tracing::info!("Skipping migration of {tenant_shard_id} to {node}, error querying secondary: {e}"); + } + Ok(progress) => { + // We require secondary locations to have less than 10GiB of downloads pending before we will use + // them in an optimization + const DOWNLOAD_FRESHNESS_THRESHOLD: u64 = 10 * 1024 * 1024 * 1024; + + if progress.heatmap_mtime.is_none() + || progress.bytes_total < DOWNLOAD_FRESHNESS_THRESHOLD + && progress.bytes_downloaded != progress.bytes_total + || progress.bytes_total - progress.bytes_downloaded + > DOWNLOAD_FRESHNESS_THRESHOLD + { + tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + } else { + // Location looks ready: proceed + tracing::info!( + "{tenant_shard_id} secondary on {node} is warm enough for migration: 
{progress:?}" + ); + validated_work.push((tenant_shard_id, optimization)) + } + } + } + } + + validated_work + } + + /// Look for shards which are oversized and in need of splitting + async fn autosplit_tenants(self: &Arc) { + let Some(split_threshold) = self.config.split_threshold else { + // Auto-splitting is disabled + return; + }; + + let nodes = self.inner.read().unwrap().nodes.clone(); + + const SPLIT_TO_MAX: ShardCount = ShardCount::new(8); + + let mut top_n = Vec::new(); + + // Call into each node to look for big tenants + let top_n_request = TopTenantShardsRequest { + // We currently split based on logical size, for simplicity: logical size is a signal of + // the user's intent to run a large database, whereas physical/resident size can be symptoms + // of compaction issues. Eventually we should switch to using resident size to bound the + // disk space impact of one shard. + order_by: models::TenantSorting::MaxLogicalSize, + limit: 10, + where_shards_lt: Some(SPLIT_TO_MAX), + where_gt: Some(split_threshold), + }; + for node in nodes.values() { + let request_ref = &top_n_request; + match node + .with_client_retries( + |client| async move { + let request = request_ref.clone(); + client.top_tenant_shards(request.clone()).await + }, + &self.config.jwt_token, + 3, + 3, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(node_top_n)) => { + top_n.extend(node_top_n.shards.into_iter()); + } + Some(Err(mgmt_api::Error::Cancelled)) => { + continue; + } + Some(Err(e)) => { + tracing::warn!("Failed to fetch top N tenants from {node}: {e}"); + continue; + } + None => { + // Node is shutting down + continue; + } + }; + } + + // Pick the biggest tenant to split first + top_n.sort_by_key(|i| i.resident_size); + let Some(split_candidate) = top_n.into_iter().next() else { + tracing::debug!("No split-elegible shards found"); + return; + }; + + // We spawn a task to run this, so it's exactly like some external API client requesting it. 
We don't + // want to block the background reconcile loop on this. + tracing::info!("Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}"); + + let this = self.clone(); + tokio::spawn( + async move { + match this + .tenant_shard_split( + split_candidate.id.tenant_id, + TenantShardSplitRequest { + // Always split to the max number of shards: this avoids stepping through + // intervening shard counts and encountering the overrhead of a split+cleanup + // each time as a tenant grows, and is not too expensive because our max shard + // count is relatively low anyway. + // This policy will be adjusted in future once we support higher shard count. + new_shard_count: SPLIT_TO_MAX.literal(), + new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE), + }, + ) + .await + { + Ok(_) => { + tracing::info!("Successful auto-split"); + } + Err(e) => { + tracing::error!("Auto-split failed: {e}"); + } + } + } + .instrument(tracing::info_span!("auto_split", tenant_id=%split_candidate.id.tenant_id)), + ); + } + + /// Useful for tests: run whatever work a background [`Self::reconcile_all`] would have done, but + /// also wait for any generated Reconcilers to complete. Calling this until it returns zero should + /// put the system into a quiescent state where future background reconciliations won't do anything. 
+ pub(crate) async fn reconcile_all_now(&self) -> Result { + let reconciles_spawned = self.reconcile_all(); + let reconciles_spawned = if reconciles_spawned == 0 { + // Only optimize when we are otherwise idle + self.optimize_all().await + } else { + reconciles_spawned + }; + + let waiters = { + let mut waiters = Vec::new(); + let locked = self.inner.read().unwrap(); + for (_tenant_shard_id, shard) in locked.tenants.iter() { + if let Some(waiter) = shard.get_waiter() { + waiters.push(waiter); + } + } + waiters + }; + + let waiter_count = waiters.len(); + match self.await_waiters(waiters, RECONCILE_TIMEOUT).await { + Ok(()) => {} + Err(ReconcileWaitError::Failed(_, reconcile_error)) + if matches!(*reconcile_error, ReconcileError::Cancel) => + { + // Ignore reconciler cancel errors: this reconciler might have shut down + // because some other change superseded it. We will return a nonzero number, + // so the caller knows they might have to call again to quiesce the system. + } + Err(e) => { + return Err(e); + } + }; + + tracing::info!( + "{} reconciles in reconcile_all, {} waiters", + reconciles_spawned, + waiter_count + ); + + Ok(std::cmp::max(waiter_count, reconciles_spawned)) + } + + /// Fire the service-wide cancellation token, then wait for the gate to close. + pub async fn shutdown(&self) { + // Note that this already stops processing any results from reconciles: so + // we do not expect that our [`TenantShard`] objects will reach a neat + // final state. + self.cancel.cancel(); + + // The cancellation tokens in [`crate::reconciler::Reconciler`] are children + // of our cancellation token, so we do not need to explicitly cancel each of + // them. + + // Background tasks and reconcilers hold gate guards: this waits for them all + // to complete. + self.gate.close().await; + } + + /// Drain a node by moving the shards attached to it as primaries. + /// This is a long running operation and it should run as a separate Tokio task.
+ pub(crate) async fn drain_node( + &self, + node_id: NodeId, + cancel: CancellationToken, + ) -> Result<(), OperationError> { + let mut last_inspected_shard: Option = None; + let mut inspected_all_shards = false; + let mut waiters = Vec::new(); + let mut schedule_context = ScheduleContext::default(); + + while !inspected_all_shards { + if cancel.is_cancelled() { + return Err(OperationError::Cancelled); + } + + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged( + format!("node {node_id} was removed").into(), + ))?; + + let current_policy = node.get_scheduling(); + if !matches!(current_policy, NodeSchedulingPolicy::Draining) { + // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think + // about it + return Err(OperationError::NodeStateChanged( + format!("node {node_id} changed state to {current_policy:?}").into(), + )); + } + + let mut cursor = tenants.iter_mut().skip_while({ + let skip_past = last_inspected_shard; + move |(tid, _)| match skip_past { + Some(last) => **tid != last, + None => false, + } + }); + + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + let (tid, tenant_shard) = match cursor.next() { + Some(some) => some, + None => { + inspected_all_shards = true; + break; + } + }; + + if tenant_shard.intent.demote_attached(scheduler, node_id) { + match tenant_shard.schedule(scheduler, &mut schedule_context) { + Err(e) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Scheduling error when draining pageserver {} : {e}", node_id + ); + } + Ok(()) => { + let scheduled_to = tenant_shard.intent.get_attached(); + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Rescheduled shard while draining node {}: {} -> {:?}", + node_id, + node_id, + scheduled_to + ); + + let waiter = self.maybe_reconcile_shard(tenant_shard, nodes); + if let Some(some) = waiter 
{ + waiters.push(some); + } + } + } + } + + last_inspected_shard = Some(*tid); + } + } + + waiters = self + .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await; + } + + while !waiters.is_empty() { + tracing::info!("Awaiting {} pending drain reconciliations", waiters.len()); + + waiters = self + .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await; + } + + // At this point we have done the best we could to drain shards from this node. + // Set the node scheduling policy to `[NodeSchedulingPolicy::PauseForRestart]` + // to complete the drain. + if let Err(err) = self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::PauseForRestart)) + .await + { + // This is not fatal. Anything that is polling the node scheduling policy to detect + // the end of the drain operations will hang, but all such places should enforce an + // overall timeout. The scheduling policy will be updated upon node re-attach and/or + // by the counterpart fill operation. + return Err(OperationError::FinalizeError( + format!( + "Failed to finalise drain of {node_id} by setting scheduling policy to PauseForRestart: {err}" + ) + .into(), + )); + } + + Ok(()) + } + + /// Create a node fill plan (pick secondaries to promote) that meets the following requirements: + /// 1. The node should be filled until it reaches the expected cluster average of + /// attached shards. If there are not enough secondaries on the node, the plan stops early. + /// 2. Select tenant shards to promote such that the number of attached shards is balanced + /// throughout the cluster. We achieve this by picking tenant shards from each node, + /// starting from the ones with the largest number of attached shards, until the node + /// reaches the expected cluster average. 
+ fn fill_node_plan(&self, node_id: NodeId) -> Vec { + let mut locked = self.inner.write().unwrap(); + let fill_requirement = locked.scheduler.compute_fill_requirement(node_id); + + let mut tids_by_node = locked + .tenants + .iter_mut() + .filter_map(|(tid, tenant_shard)| { + if tenant_shard.intent.get_secondary().contains(&node_id) { + if let Some(primary) = tenant_shard.intent.get_attached() { + return Some((*primary, *tid)); + } + } + + None + }) + .into_group_map(); + + let expected_attached = locked.scheduler.expected_attached_shard_count(); + let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count(); + + let mut plan = Vec::new(); + for (node_id, attached) in nodes_by_load { + if plan.len() >= fill_requirement + || tids_by_node.is_empty() + || attached <= expected_attached + { + break; + } + + let can_take = attached - expected_attached; + let mut remove_node = false; + for _ in 0..can_take { + match tids_by_node.get_mut(&node_id) { + Some(tids) => match tids.pop() { + Some(tid) => { + plan.push(tid); + } + None => { + remove_node = true; + break; + } + }, + None => { + break; + } + } + } + + if remove_node { + tids_by_node.remove(&node_id); + } + } + + plan + } + + /// Fill a node by promoting its secondaries until the cluster is balanced + /// with regards to attached shard counts. Note that this operation only + /// makes sense as a counterpart to the drain implemented in [`Service::drain_node`]. + /// This is a long running operation and it should run as a separate Tokio task. + pub(crate) async fn fill_node( + &self, + node_id: NodeId, + cancel: CancellationToken, + ) -> Result<(), OperationError> { + // TODO(vlad): Currently this operates on the assumption that all + // secondaries are warm. This is not always true (e.g. we just migrated the + // tenant). Take that into consideration by checking the secondary status. 
+ let mut tids_to_promote = self.fill_node_plan(node_id); + + let mut waiters = Vec::new(); + let mut schedule_context = ScheduleContext::default(); + + // Execute the plan we've composed above. Before applying each move from the plan, + // we validate to ensure that it has not gone stale in the meantime. + while !tids_to_promote.is_empty() { + if cancel.is_cancelled() { + return Err(OperationError::Cancelled); + } + + { + let mut locked = self.inner.write().unwrap(); + let (nodes, tenants, scheduler) = locked.parts_mut(); + + let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged( + format!("node {node_id} was removed").into(), + ))?; + + let current_policy = node.get_scheduling(); + if !matches!(current_policy, NodeSchedulingPolicy::Filling) { + // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think + // about it + return Err(OperationError::NodeStateChanged( + format!("node {node_id} changed state to {current_policy:?}").into(), + )); + } + + while waiters.len() < MAX_RECONCILES_PER_OPERATION { + if let Some(tid) = tids_to_promote.pop() { + if let Some(tenant_shard) = tenants.get_mut(&tid) { + // If the node being filled is not a secondary anymore, + // skip the promotion.
+ if !tenant_shard.intent.get_secondary().contains(&node_id) { + continue; + } + + let previously_attached_to = *tenant_shard.intent.get_attached(); + + tenant_shard.intent.promote_attached(scheduler, node_id); + match tenant_shard.schedule(scheduler, &mut schedule_context) { + Err(e) => { + tracing::warn!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Scheduling error when filling pageserver {} : {e}", node_id + ); + } + Ok(()) => { + tracing::info!( + tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(), + "Rescheduled shard while filling node {}: {:?} -> {}", + node_id, + previously_attached_to, + node_id + ); + + if let Some(waiter) = + self.maybe_reconcile_shard(tenant_shard, nodes) + { + waiters.push(waiter); + } + } + } + } + } else { + break; + } + } + } + + waiters = self + .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await; + } + + while !waiters.is_empty() { + tracing::info!("Awaiting {} pending fill reconciliations", waiters.len()); + + waiters = self + .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await; + } + + if let Err(err) = self + .node_configure(node_id, None, Some(NodeSchedulingPolicy::Active)) + .await + { + // This isn't a huge issue since the filling process starts upon request. However, it + // will prevent the next drain from starting. The only case in which this can fail + // is database unavailability. Such a case will require manual intervention. 
+ return Err(OperationError::FinalizeError( + format!("Failed to finalise fill of {node_id} by setting scheduling policy to Active: {err}") + .into(), + )); + } + + Ok(()) + } +} diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs new file mode 100644 index 0000000000..d1b632755f --- /dev/null +++ b/storage_controller/src/tenant_shard.rs @@ -0,0 +1,1668 @@ +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, + time::Duration, +}; + +use crate::{ + metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, + persistence::TenantShardPersistence, + reconciler::ReconcileUnits, + scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, +}; +use pageserver_api::controller_api::{ + NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, +}; +use pageserver_api::{ + models::{LocationConfig, LocationConfigMode, TenantConfig}, + shard::{ShardIdentity, TenantShardId}, +}; +use serde::Serialize; +use tokio::task::JoinHandle; +use tokio_util::sync::CancellationToken; +use tracing::{instrument, Instrument}; +use utils::{ + generation::Generation, + id::NodeId, + seqwait::{SeqWait, SeqWaitError}, + sync::gate::GateGuard, +}; + +use crate::{ + compute_hook::ComputeHook, + node::Node, + persistence::{split_state::SplitState, Persistence}, + reconciler::{ + attached_location_conf, secondary_location_conf, ReconcileError, Reconciler, TargetState, + }, + scheduler::{ScheduleError, Scheduler}, + service, Sequence, +}; + +/// Serialization helper +fn read_last_error(v: &std::sync::Mutex>, serializer: S) -> Result +where + S: serde::ser::Serializer, + T: std::fmt::Display, +{ + serializer.collect_str( + &v.lock() + .unwrap() + .as_ref() + .map(|e| format!("{e}")) + .unwrap_or("".to_string()), + ) +} + +/// In-memory state for a particular tenant shard. 
+/// +/// This struct implements Serialize for debugging purposes, but is _not_ persisted +/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted. +#[derive(Serialize)] +pub(crate) struct TenantShard { + pub(crate) tenant_shard_id: TenantShardId, + + pub(crate) shard: ShardIdentity, + + // Runtime only: sequence used to coordinate when updating this object while + // background reconcilers may be running. A reconciler runs to a particular + // sequence. + pub(crate) sequence: Sequence, + + // Latest generation number: next time we attach, increment this + // and use the incremented number when attaching. + // + // None represents an incompletely onboarded tenant via the [`Service::location_config`] + // API, where this tenant may only run in PlacementPolicy::Secondary. + pub(crate) generation: Option, + + // High level description of how the tenant should be set up. Provided + // externally. + pub(crate) policy: PlacementPolicy, + + // Low level description of exactly which pageservers should fulfil + // which role. Generated by `Self::schedule`. + pub(crate) intent: IntentState, + + // Low level description of how the tenant is configured on pageservers: + // if this does not match `Self::intent` then the tenant needs reconciliation + // with `Self::reconcile`. + pub(crate) observed: ObservedState, + + // Tenant configuration, passed through opaquely to the pageserver. Identical + // for all shards in a tenant. + pub(crate) config: TenantConfig, + + /// If a reconcile task is currently in flight, it may be joined here (it is + /// only safe to join if either the result has been received or the reconciler's + /// cancellation token has been fired) + #[serde(skip)] + pub(crate) reconciler: Option, + + /// If a tenant is being split, then all shards with that TenantId will have a + /// SplitState set, this acts as a guard against other operations such as background + /// reconciliation, and timeline creation.
+ pub(crate) splitting: SplitState, + + /// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag + /// is set. This flag is cleared when the tenant is popped off the delay queue. + pub(crate) delayed_reconcile: bool, + + /// Optionally wait for reconciliation to complete up to a particular + /// sequence number. + #[serde(skip)] + pub(crate) waiter: std::sync::Arc>, + + /// Indicates sequence number for which we have encountered an error reconciling. If + /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred, + /// and callers should stop waiting for `waiter` and propagate the error. + #[serde(skip)] + pub(crate) error_waiter: std::sync::Arc>, + + /// The most recent error from a reconcile on this tenant. This is a nested Arc + /// because: + /// - ReconcilerWaiters need to Arc-clone the overall object to read it later + /// - ReconcileWaitError needs to use an `Arc` because we can construct + /// many waiters for one shard, and the underlying error types are not Clone. + /// TODO: generalize to an array of recent events + /// TODO: use an ArcSwap instead of mutex for faster reads? + #[serde(serialize_with = "read_last_error")] + pub(crate) last_error: std::sync::Arc>>>, + + /// If we have a pending compute notification that for some reason we weren't able to send, + /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes + /// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope + /// of state that we publish externally in an eventually consistent way. + pub(crate) pending_compute_notification: bool, + + // Support/debug tool: if something is going wrong or flapping with scheduling, this may + // be set to a non-active state to avoid making changes while the issue is fixed.
+ scheduling_policy: ShardSchedulingPolicy, +} + +#[derive(Default, Clone, Debug, Serialize)] +pub(crate) struct IntentState { + attached: Option, + secondary: Vec, +} + +impl IntentState { + pub(crate) fn new() -> Self { + Self { + attached: None, + secondary: vec![], + } + } + pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { + if let Some(node_id) = node_id { + scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach); + } + Self { + attached: node_id, + secondary: vec![], + } + } + + pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { + if self.attached != new_attached { + if let Some(old_attached) = self.attached.take() { + scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + } + if let Some(new_attached) = &new_attached { + scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach); + } + self.attached = new_attached; + } + } + + /// Like set_attached, but the node is from [`Self::secondary`]. This swaps the node from + /// secondary to attached while maintaining the scheduler's reference counts. + pub(crate) fn promote_attached( + &mut self, + scheduler: &mut Scheduler, + promote_secondary: NodeId, + ) { + // If we call this with a node that isn't in secondary, it would cause incorrect + // scheduler reference counting, since we assume the node is already referenced as a secondary. 
+ debug_assert!(self.secondary.contains(&promote_secondary)); + + self.secondary.retain(|n| n != &promote_secondary); + + let demoted = self.attached; + self.attached = Some(promote_secondary); + + scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary); + if let Some(demoted) = demoted { + scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached); + } + } + + pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { + debug_assert!(!self.secondary.contains(&new_secondary)); + scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary); + self.secondary.push(new_secondary); + } + + /// It is legal to call this with a node that is not currently a secondary: that is a no-op + pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { + let index = self.secondary.iter().position(|n| *n == node_id); + if let Some(index) = index { + scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + self.secondary.remove(index); + } + } + + pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { + for secondary in self.secondary.drain(..) 
{ + scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary); + } + } + + /// Remove the last secondary node from the list of secondaries + pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { + if let Some(node_id) = self.secondary.pop() { + scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); + } + } + + pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { + if let Some(old_attached) = self.attached.take() { + scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); + } + + self.clear_secondary(scheduler); + } + + pub(crate) fn all_pageservers(&self) -> Vec { + let mut result = Vec::new(); + if let Some(p) = self.attached { + result.push(p) + } + + result.extend(self.secondary.iter().copied()); + + result + } + + pub(crate) fn get_attached(&self) -> &Option { + &self.attached + } + + pub(crate) fn get_secondary(&self) -> &Vec { + &self.secondary + } + + /// If the node is in use as the attached location, demote it into + /// the list of secondary locations. This is used when a node goes offline, + /// and we want to use a different node for attachment, but not permanently + /// forget the location on the offline node. + /// + /// Returns true if a change was made + pub(crate) fn demote_attached(&mut self, scheduler: &mut Scheduler, node_id: NodeId) -> bool { + if self.attached == Some(node_id) { + self.attached = None; + self.secondary.push(node_id); + scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached); + true + } else { + false + } + } +} + +impl Drop for IntentState { + fn drop(&mut self) { + // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler. + // We do not check this while panicking, to avoid polluting unit test failures or + // other assertions with this assertion's output. It's still wrong to leak these, + // but if we already have a panic then we don't need to independently flag this case. 
+ if !(std::thread::panicking()) { + debug_assert!(self.attached.is_none() && self.secondary.is_empty()); + } + } +} + +#[derive(Default, Clone, Serialize)] +pub(crate) struct ObservedState { + pub(crate) locations: HashMap, +} + +/// Our latest knowledge of how this tenant is configured in the outside world. +/// +/// Meaning: +/// * No instance of this type exists for a node: we are certain that we have nothing configured on that +/// node for this shard. +/// * Instance exists with conf==None: we *might* have some state on that node, but we don't know +/// what it is (e.g. we failed partway through configuring it) +/// * Instance exists with conf==Some: this tells us what we last successfully configured on this node, +/// and that configuration will still be present unless something external interfered. +#[derive(Clone, Serialize)] +pub(crate) struct ObservedStateLocation { + /// If None, it means we do not know the status of this shard's location on this node, but + /// we know that we might have some state on this node. + pub(crate) conf: Option, +} +pub(crate) struct ReconcilerWaiter { + // For observability purposes, remember the ID of the shard we're + // waiting for. 
+ pub(crate) tenant_shard_id: TenantShardId, + + seq_wait: std::sync::Arc>, + error_seq_wait: std::sync::Arc>, + error: std::sync::Arc>>>, + seq: Sequence, +} + +pub(crate) enum ReconcilerStatus { + Done, + Failed, + InProgress, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum ReconcileWaitError { + #[error("Timeout waiting for shard {0}")] + Timeout(TenantShardId), + #[error("shutting down")] + Shutdown, + #[error("Reconcile error on shard {0}: {1}")] + Failed(TenantShardId, Arc), +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ReplaceSecondary { + old_node_id: NodeId, + new_node_id: NodeId, +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct MigrateAttachment { + pub(crate) old_attached_node_id: NodeId, + pub(crate) new_attached_node_id: NodeId, +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) enum ScheduleOptimizationAction { + // Replace one of our secondary locations with a different node + ReplaceSecondary(ReplaceSecondary), + // Migrate attachment to an existing secondary location + MigrateAttachment(MigrateAttachment), +} + +#[derive(Eq, PartialEq, Debug)] +pub(crate) struct ScheduleOptimization { + // What was the reconcile sequence when we generated this optimization? The optimization + // should only be applied if the shard's sequence is still at this value, in case other changes + // happened between planning the optimization and applying it. + sequence: Sequence, + + pub(crate) action: ScheduleOptimizationAction, +} + +impl ReconcilerWaiter { + pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> { + tokio::select! 
{ + result = self.seq_wait.wait_for_timeout(self.seq, timeout)=> { + result.map_err(|e| match e { + SeqWaitError::Timeout => ReconcileWaitError::Timeout(self.tenant_shard_id), + SeqWaitError::Shutdown => ReconcileWaitError::Shutdown + })?; + }, + result = self.error_seq_wait.wait_for(self.seq) => { + result.map_err(|e| match e { + SeqWaitError::Shutdown => ReconcileWaitError::Shutdown, + SeqWaitError::Timeout => unreachable!() + })?; + + return Err(ReconcileWaitError::Failed(self.tenant_shard_id, + self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone())) + } + } + + Ok(()) + } + + pub(crate) fn get_status(&self) -> ReconcilerStatus { + if self.seq_wait.would_wait_for(self.seq).is_err() { + ReconcilerStatus::Done + } else if self.error_seq_wait.would_wait_for(self.seq).is_err() { + ReconcilerStatus::Failed + } else { + ReconcilerStatus::InProgress + } + } +} + +/// Having spawned a reconciler task, the tenant shard's state will carry enough +/// information to optionally cancel & await it later. +pub(crate) struct ReconcilerHandle { + sequence: Sequence, + handle: JoinHandle<()>, + cancel: CancellationToken, +} + +/// Answer to "does this shard need a reconciler spawned right now?" +pub(crate) enum ReconcileNeeded { + /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler + /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it) + No, + /// shard has a reconciler running, and its intent hasn't changed since that one was + /// spawned: wait for the existing reconciler rather than spawning a new one. + WaitExisting(ReconcilerWaiter), + /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`] + Yes, +} + +/// When a reconcile task completes, it sends this result object +/// to be applied to the primary TenantShard. +pub(crate) struct ReconcileResult { + pub(crate) sequence: Sequence, + /// On errors, `observed` should be treated as an incomplete description + /// of state (i.e.
any nodes present in the result should override nodes + /// present in the parent tenant state, but any unmentioned nodes should + /// not be removed from parent tenant state) + pub(crate) result: Result<(), ReconcileError>, + + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) generation: Option, + pub(crate) observed: ObservedState, + + /// Set [`TenantShard::pending_compute_notification`] from this flag + pub(crate) pending_compute_notification: bool, +} + +impl ObservedState { + pub(crate) fn new() -> Self { + Self { + locations: HashMap::new(), + } + } +} + +impl TenantShard { + pub(crate) fn new( + tenant_shard_id: TenantShardId, + shard: ShardIdentity, + policy: PlacementPolicy, + ) -> Self { + Self { + tenant_shard_id, + policy, + intent: IntentState::default(), + generation: Some(Generation::new(0)), + shard, + observed: ObservedState::default(), + config: TenantConfig::default(), + reconciler: None, + splitting: SplitState::Idle, + sequence: Sequence(1), + delayed_reconcile: false, + waiter: Arc::new(SeqWait::new(Sequence(0))), + error_waiter: Arc::new(SeqWait::new(Sequence(0))), + last_error: Arc::default(), + pending_compute_notification: false, + scheduling_policy: ShardSchedulingPolicy::default(), + } + } + + /// For use on startup when learning state from pageservers: generate my [`IntentState`] from my + /// [`ObservedState`], even if it violates my [`PlacementPolicy`]. Call [`Self::schedule`] next, + /// to get an intent state that complies with placement policy. The overall goal is to do scheduling + /// in a way that makes use of any configured locations that already exist in the outside world. 
+ pub(crate) fn intent_from_observed(&mut self, scheduler: &mut Scheduler) { + // Choose an attached location by filtering observed locations, and then sorting to get the highest + // generation + let mut attached_locs = self + .observed + .locations + .iter() + .filter_map(|(node_id, l)| { + if let Some(conf) = &l.conf { + if conf.mode == LocationConfigMode::AttachedMulti + || conf.mode == LocationConfigMode::AttachedSingle + || conf.mode == LocationConfigMode::AttachedStale + { + Some((node_id, conf.generation)) + } else { + None + } + } else { + None + } + }) + .collect::>(); + + attached_locs.sort_by_key(|i| i.1); + if let Some((node_id, _gen)) = attached_locs.into_iter().last() { + self.intent.set_attached(scheduler, Some(*node_id)); + } + + // All remaining observed locations generate secondary intents. This includes None + // observations, as these may well have some local content on disk that is usable (this + // is an edge case that might occur if we restarted during a migration or other change) + // + // We may leave intent.attached empty if we didn't find any attached locations: [`Self::schedule`] + // will take care of promoting one of these secondaries to be attached. + self.observed.locations.keys().for_each(|node_id| { + if Some(*node_id) != self.intent.attached { + self.intent.push_secondary(scheduler, *node_id); + } + }); + } + + /// Part of [`Self::schedule`] that is used to choose exactly one node to act as the + /// attached pageserver for a shard. + /// + /// Returns whether we modified it, and the NodeId selected. 
+ fn schedule_attached( + &mut self, + scheduler: &mut Scheduler, + context: &ScheduleContext, + ) -> Result<(bool, NodeId), ScheduleError> { + // No work to do if we already have an attached tenant + if let Some(node_id) = self.intent.attached { + return Ok((false, node_id)); + } + + if let Some(promote_secondary) = scheduler.node_preferred(&self.intent.secondary) { + // Promote a secondary + tracing::debug!("Promoted secondary {} to attached", promote_secondary); + self.intent.promote_attached(scheduler, promote_secondary); + Ok((true, promote_secondary)) + } else { + // Pick a fresh node: either we had no secondaries or none were schedulable + let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?; + tracing::debug!("Selected {} as attached", node_id); + self.intent.set_attached(scheduler, Some(node_id)); + Ok((true, node_id)) + } + } + + pub(crate) fn schedule( + &mut self, + scheduler: &mut Scheduler, + context: &mut ScheduleContext, + ) -> Result<(), ScheduleError> { + let r = self.do_schedule(scheduler, context); + + context.avoid(&self.intent.all_pageservers()); + if let Some(attached) = self.intent.get_attached() { + context.push_attached(*attached); + } + + r + } + + pub(crate) fn do_schedule( + &mut self, + scheduler: &mut Scheduler, + context: &ScheduleContext, + ) -> Result<(), ScheduleError> { + // TODO: before scheduling new nodes, check if any existing content in + // self.intent refers to pageservers that are offline, and pick other + // pageservers if so. + + // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not + // change their attach location. 
+ + match self.scheduling_policy { + ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {} + ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { + // Warn to make it obvious why other things aren't happening/working, if we skip scheduling + tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), + "Scheduling is disabled by policy {:?}", self.scheduling_policy); + return Ok(()); + } + } + + // Build the set of pageservers already in use by this tenant, to avoid scheduling + // more work on the same pageservers we're already using. + let mut modified = false; + + // Add/remove nodes to fulfil policy + use PlacementPolicy::*; + match self.policy { + Attached(secondary_count) => { + let retain_secondaries = if self.intent.attached.is_none() + && scheduler.node_preferred(&self.intent.secondary).is_some() + { + // If we have no attached, and one of the secondaries is eligible to be promoted, retain + // one more secondary than we usually would, as one of them will become attached further down this function.
+ secondary_count + 1 + } else { + secondary_count + }; + + while self.intent.secondary.len() > retain_secondaries { + // We have no particular preference for one secondary location over another: just + // arbitrarily drop from the end + self.intent.pop_secondary(scheduler); + modified = true; + } + + // Should have exactly one attached, and N secondaries + let (modified_attached, attached_node_id) = + self.schedule_attached(scheduler, context)?; + modified |= modified_attached; + + let mut used_pageservers = vec![attached_node_id]; + while self.intent.secondary.len() < secondary_count { + let node_id = scheduler.schedule_shard(&used_pageservers, context)?; + self.intent.push_secondary(scheduler, node_id); + used_pageservers.push(node_id); + modified = true; + } + } + Secondary => { + if let Some(node_id) = self.intent.get_attached() { + // Populate secondary by demoting the attached node + self.intent.demote_attached(scheduler, *node_id); + modified = true; + } else if self.intent.secondary.is_empty() { + // Populate secondary by scheduling a fresh node + let node_id = scheduler.schedule_shard(&[], context)?; + self.intent.push_secondary(scheduler, node_id); + modified = true; + } + while self.intent.secondary.len() > 1 { + // We have no particular preference for one secondary location over another: just + // arbitrarily drop from the end + self.intent.pop_secondary(scheduler); + modified = true; + } + } + Detached => { + // Never add locations in this mode + if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() { + self.intent.clear(scheduler); + modified = true; + } + } + } + + if modified { + self.sequence.0 += 1; + } + + Ok(()) + } + + /// Optimize attachments: if a shard has a secondary location that is preferable to + /// its primary location based on soft constraints, switch that secondary location + /// to be attached. 
+ #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn optimize_attachment( + &self, + nodes: &HashMap, + schedule_context: &ScheduleContext, + ) -> Option { + let attached = (*self.intent.get_attached())?; + if self.intent.secondary.is_empty() { + // We can only do useful work if we have both attached and secondary locations: this + // function doesn't schedule new locations, only swaps between attached and secondaries. + return None; + } + + let current_affinity_score = schedule_context.get_node_affinity(attached); + let current_attachment_count = schedule_context.get_node_attachments(attached); + + // Generate score for each node, dropping any un-schedulable nodes. + let all_pageservers = self.intent.all_pageservers(); + let mut scores = all_pageservers + .iter() + .flat_map(|node_id| { + let node = nodes.get(node_id); + if node.is_none() { + None + } else if matches!( + node.unwrap().get_scheduling(), + NodeSchedulingPolicy::Filling + ) { + // If the node is currently filling, don't count it as a candidate to avoid, + // racing with the background fill. + None + } else if matches!(node.unwrap().may_schedule(), MaySchedule::No) { + None + } else { + let affinity_score = schedule_context.get_node_affinity(*node_id); + let attachment_count = schedule_context.get_node_attachments(*node_id); + Some((*node_id, affinity_score, attachment_count)) + } + }) + .collect::>(); + + // Sort precedence: + // 1st - prefer nodes with the lowest total affinity score + // 2nd - prefer nodes with the lowest number of attachments in this context + // 3rd - if all else is equal, sort by node ID for determinism in tests. 
+ scores.sort_by_key(|i| (i.1, i.2, i.0)); + + if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) = + scores.first() + { + if attached != *preferred_node { + // The best alternative must be more than 1 better than us, otherwise we could end + // up flapping back next time we're called (e.g. there's no point migrating from + // a location with score 1 to a score zero, because on next location the situation + // would be the same, but in reverse). + if current_affinity_score > *preferred_affinity_score + AffinityScore(1) + || current_attachment_count > *preferred_attachment_count + 1 + { + tracing::info!( + "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})", + self.intent.get_secondary() + ); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: attached, + new_attached_node_id: *preferred_node, + }), + }); + } + } else { + tracing::debug!( + "Node {} is already preferred (score {:?})", + preferred_node, + preferred_affinity_score + ); + } + } + + // Fall-through: we didn't find an optimization + None + } + + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn optimize_secondary( + &self, + scheduler: &Scheduler, + schedule_context: &ScheduleContext, + ) -> Option { + if self.intent.secondary.is_empty() { + // We can only do useful work if we have both attached and secondary locations: this + // function doesn't schedule new locations, only swaps between attached and secondaries. + return None; + } + + for secondary in self.intent.get_secondary() { + let Some(affinity_score) = schedule_context.nodes.get(secondary) else { + // We're already on a node unaffected any affinity constraints, + // so we won't change it. 
+ continue; + }; + + // Let the scheduler suggest a node, where it would put us if we were scheduling afresh + // This implicitly limits the choice to nodes that are available, and prefers nodes + // with lower utilization. + let Ok(candidate_node) = + scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context) + else { + // A scheduling error means we have no possible candidate replacements + continue; + }; + + let candidate_affinity_score = schedule_context + .nodes + .get(&candidate_node) + .unwrap_or(&AffinityScore::FREE); + + // The best alternative must be more than 1 better than us, otherwise we could end + // up flapping back next time we're called. + if *candidate_affinity_score + AffinityScore(1) < *affinity_score { + // If some other node is available and has a lower score than this node, then + // that other node is a good place to migrate to. + tracing::info!( + "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})", + self.intent.get_secondary() + ); + return Some(ScheduleOptimization { + sequence: self.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: *secondary, + new_node_id: candidate_node, + }), + }); + } + } + + None + } + + /// Return true if the optimization was really applied: it will not be applied if the optimization's + /// sequence is behind this tenant shard's + pub(crate) fn apply_optimization( + &mut self, + scheduler: &mut Scheduler, + optimization: ScheduleOptimization, + ) -> bool { + if optimization.sequence != self.sequence { + return false; + } + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_schedule_optimization + .inc(); + + match optimization.action { + ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id, + new_attached_node_id, + }) => { + self.intent.demote_attached(scheduler, old_attached_node_id); + self.intent + .promote_attached(scheduler, 
new_attached_node_id); + } + ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id, + new_node_id, + }) => { + self.intent.remove_secondary(scheduler, old_node_id); + self.intent.push_secondary(scheduler, new_node_id); + } + } + + true + } + + /// Query whether the tenant's observed state for attached node matches its intent state, and if so, + /// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that + /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there. + /// + /// Reconciliation may still be needed for other aspects of state such as secondaries (see [`Self::dirty`]): this + /// funciton should not be used to decide whether to reconcile. + pub(crate) fn stably_attached(&self) -> Option { + if let Some(attach_intent) = self.intent.attached { + match self.observed.locations.get(&attach_intent) { + Some(loc) => match &loc.conf { + Some(conf) => match conf.mode { + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + // Our intent and observed state agree that this node is in an attached state. + Some(attach_intent) + } + // Our observed config is not an attached state + _ => None, + }, + // Our observed state is None, i.e. in flux + None => None, + }, + // We have no observed state for this node + None => None, + } + } else { + // Our intent is not to attach + None + } + } + + fn dirty(&self, nodes: &Arc>) -> bool { + let mut dirty_nodes = HashSet::new(); + + if let Some(node_id) = self.intent.attached { + // Maybe panic: it is a severe bug if we try to attach while generation is null. 
+ let generation = self + .generation + .expect("Attempted to enter attached state without a generation"); + + let wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + !self.intent.secondary.is_empty(), + ); + match self.observed.locations.get(&node_id) { + Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} + Some(_) | None => { + dirty_nodes.insert(node_id); + } + } + } + + for node_id in &self.intent.secondary { + let wanted_conf = secondary_location_conf(&self.shard, &self.config); + match self.observed.locations.get(node_id) { + Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} + Some(_) | None => { + dirty_nodes.insert(*node_id); + } + } + } + + for node_id in self.observed.locations.keys() { + if self.intent.attached != Some(*node_id) && !self.intent.secondary.contains(node_id) { + // We have observed state that isn't part of our intent: need to clean it up. + dirty_nodes.insert(*node_id); + } + } + + dirty_nodes.retain(|node_id| { + nodes + .get(node_id) + .map(|n| n.is_available()) + .unwrap_or(false) + }); + + !dirty_nodes.is_empty() + } + + #[allow(clippy::too_many_arguments)] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn get_reconcile_needed( + &mut self, + pageservers: &Arc>, + ) -> ReconcileNeeded { + // If there are any ambiguous observed states, and the nodes they refer to are available, + // we should reconcile to clean them up. + let mut dirty_observed = false; + for (node_id, observed_loc) in &self.observed.locations { + let node = pageservers + .get(node_id) + .expect("Nodes may not be removed while referenced"); + if observed_loc.conf.is_none() && node.is_available() { + dirty_observed = true; + break; + } + } + + let active_nodes_dirty = self.dirty(pageservers); + + // Even if there is no pageserver work to be done, if we have a pending notification to computes, + // wake up a reconciler to send it. 
+ let do_reconcile = + active_nodes_dirty || dirty_observed || self.pending_compute_notification; + + if !do_reconcile { + tracing::debug!("Not dirty, no reconciliation needed."); + return ReconcileNeeded::No; + } + + // If we are currently splitting, then never start a reconciler task: the splitting logic + // requires that shards are not interfered with while it runs. Do this check here rather than + // up top, so that we only log this message if we would otherwise have done a reconciliation. + if !matches!(self.splitting, SplitState::Idle) { + tracing::info!("Refusing to reconcile, splitting in progress"); + return ReconcileNeeded::No; + } + + // Reconcile already in flight for the current sequence? + if let Some(handle) = &self.reconciler { + if handle.sequence == self.sequence { + tracing::info!( + "Reconciliation already in progress for sequence {:?}", + self.sequence, + ); + return ReconcileNeeded::WaitExisting(ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + }); + } + } + + // Pre-checks done: finally check whether we may actually do the work + match self.scheduling_policy { + ShardSchedulingPolicy::Active + | ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause => {} + ShardSchedulingPolicy::Stop => { + // We only reach this point if there is work to do and we're going to skip + // doing it: warn it obvious why this tenant isn't doing what it ought to. + tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy); + return ReconcileNeeded::No; + } + } + + ReconcileNeeded::Yes + } + + /// Ensure the sequence number is set to a value where waiting for this value will make us wait + /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers. 
+ /// + /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property + /// that the waiter will not complete until some future Reconciler is constructed and run. + fn ensure_sequence_ahead(&mut self) { + // Find the highest sequence for which a Reconciler has previously run or is currently + // running + let max_seen = std::cmp::max( + self.reconciler + .as_ref() + .map(|r| r.sequence) + .unwrap_or(Sequence(0)), + std::cmp::max(self.waiter.load(), self.error_waiter.load()), + ); + + if self.sequence <= max_seen { + self.sequence = max_seen.next(); + } + } + + /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet. + /// + /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but + /// you would like to wait on the next reconciler that gets spawned in the background. + pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter { + self.ensure_sequence_ahead(); + + ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + } + } + + #[allow(clippy::too_many_arguments)] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] + pub(crate) fn spawn_reconciler( + &mut self, + result_tx: &tokio::sync::mpsc::UnboundedSender, + pageservers: &Arc>, + compute_hook: &Arc, + service_config: &service::Config, + persistence: &Arc, + units: ReconcileUnits, + gate_guard: GateGuard, + cancel: &CancellationToken, + ) -> Option { + // Reconcile in flight for a stale sequence? Our sequence's task will wait for it before + // doing our sequence's work. 
+ let old_handle = self.reconciler.take(); + + // Build list of nodes from which the reconciler should detach + let mut detach = Vec::new(); + for node_id in self.observed.locations.keys() { + if self.intent.get_attached() != &Some(*node_id) + && !self.intent.secondary.contains(node_id) + { + detach.push( + pageservers + .get(node_id) + .expect("Intent references non-existent pageserver") + .clone(), + ) + } + } + + // Advance the sequence before spawning a reconciler, so that sequence waiters + // can distinguish between before+after the reconcile completes. + self.ensure_sequence_ahead(); + + let reconciler_cancel = cancel.child_token(); + let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); + let mut reconciler = Reconciler { + tenant_shard_id: self.tenant_shard_id, + shard: self.shard, + generation: self.generation, + intent: reconciler_intent, + detach, + config: self.config.clone(), + observed: self.observed.clone(), + compute_hook: compute_hook.clone(), + service_config: service_config.clone(), + _gate_guard: gate_guard, + _resource_units: units, + cancel: reconciler_cancel.clone(), + persistence: persistence.clone(), + compute_notify_failure: false, + }; + + let reconcile_seq = self.sequence; + + tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); + let must_notify = self.pending_compute_notification; + let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq, + tenant_id=%reconciler.tenant_shard_id.tenant_id, + shard_id=%reconciler.tenant_shard_id.shard_slug()); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_spawn + .inc(); + let result_tx = result_tx.clone(); + let join_handle = tokio::task::spawn( + async move { + // Wait for any previous reconcile task to complete before we start + if let Some(old_handle) = old_handle { + old_handle.cancel.cancel(); + if let Err(e) = old_handle.handle.await { + // We can't do much with this other 
than log it: the task is done, so + // we may proceed with our work. + tracing::error!("Unexpected join error waiting for reconcile task: {e}"); + } + } + + // Early check for cancellation before doing any work + // TODO: wrap all remote API operations in cancellation check + // as well. + if reconciler.cancel.is_cancelled() { + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: ReconcileOutcome::Cancel, + }); + return; + } + + // Attempt to make observed state match intent state + let result = reconciler.reconcile().await; + + // If we know we had a pending compute notification from some previous action, send a notification irrespective + // of whether the above reconcile() did any work + if result.is_ok() && must_notify { + // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] + reconciler.compute_notify().await.ok(); + } + + // Update result counter + let outcome_label = match &result { + Ok(_) => ReconcileOutcome::Success, + Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, + Err(_) => ReconcileOutcome::Error, + }; + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: outcome_label, + }); + + // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might + // try and schedule more work in response to our result. 
+ let result = ReconcileResult { + sequence: reconcile_seq, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + }; + + result_tx.send(result).ok(); + } + .instrument(reconciler_span), + ); + + self.reconciler = Some(ReconcilerHandle { + sequence: self.sequence, + handle: join_handle, + cancel: reconciler_cancel, + }); + + Some(ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + }) + } + + /// Get a waiter for any reconciliation in flight, but do not start reconciliation + /// if it is not already running + pub(crate) fn get_waiter(&self) -> Option { + if self.reconciler.is_some() { + Some(ReconcilerWaiter { + tenant_shard_id: self.tenant_shard_id, + seq_wait: self.waiter.clone(), + error_seq_wait: self.error_waiter.clone(), + error: self.last_error.clone(), + seq: self.sequence, + }) + } else { + None + } + } + + /// Called when a ReconcileResult has been emitted and the service is updating + /// our state: if the result is from a sequence >= my ReconcileHandle, then drop + /// the handle to indicate there is no longer a reconciliation in progress. + pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) { + if let Some(reconcile_handle) = &self.reconciler { + if reconcile_handle.sequence <= sequence { + self.reconciler = None; + } + } + } + + // If we had any state at all referring to this node ID, drop it. Does not + // attempt to reschedule. 
+ pub(crate) fn deref_node(&mut self, node_id: NodeId) { + if self.intent.attached == Some(node_id) { + self.intent.attached = None; + } + + self.intent.secondary.retain(|n| n != &node_id); + + self.observed.locations.remove(&node_id); + + debug_assert!(!self.intent.all_pageservers().contains(&node_id)); + } + + pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) { + self.scheduling_policy = p; + } + + pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy { + &self.scheduling_policy + } + + pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) { + // Ordering: always set last_error before advancing sequence, so that sequence + // waiters are guaranteed to see a Some value when they see an error. + *(self.last_error.lock().unwrap()) = Some(Arc::new(error)); + self.error_waiter.advance(sequence); + } + + pub(crate) fn from_persistent( + tsp: TenantShardPersistence, + intent: IntentState, + ) -> anyhow::Result { + let tenant_shard_id = tsp.get_tenant_shard_id()?; + let shard_identity = tsp.get_shard_identity()?; + + Ok(Self { + tenant_shard_id, + shard: shard_identity, + sequence: Sequence::initial(), + generation: tsp.generation.map(|g| Generation::new(g as u32)), + policy: serde_json::from_str(&tsp.placement_policy).unwrap(), + intent, + observed: ObservedState::new(), + config: serde_json::from_str(&tsp.config).unwrap(), + reconciler: None, + splitting: tsp.splitting, + waiter: Arc::new(SeqWait::new(Sequence::initial())), + error_waiter: Arc::new(SeqWait::new(Sequence::initial())), + last_error: Arc::default(), + pending_compute_notification: false, + delayed_reconcile: false, + scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(), + }) + } + + pub(crate) fn to_persistent(&self) -> TenantShardPersistence { + TenantShardPersistence { + tenant_id: self.tenant_shard_id.tenant_id.to_string(), + shard_number: self.tenant_shard_id.shard_number.0 as i32, + shard_count: 
self.tenant_shard_id.shard_count.literal() as i32, + shard_stripe_size: self.shard.stripe_size.0 as i32, + generation: self.generation.map(|g| g.into().unwrap_or(0) as i32), + generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64), + placement_policy: serde_json::to_string(&self.policy).unwrap(), + config: serde_json::to_string(&self.config).unwrap(), + splitting: SplitState::default(), + scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(), + } + } +} + +#[cfg(test)] +pub(crate) mod tests { + use pageserver_api::{ + controller_api::NodeAvailability, + shard::{ShardCount, ShardNumber}, + }; + use utils::id::TenantId; + + use crate::scheduler::test_utils::make_test_nodes; + + use super::*; + + fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { + let tenant_id = TenantId::generate(); + let shard_number = ShardNumber(0); + let shard_count = ShardCount::new(1); + + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number, + shard_count, + }; + TenantShard::new( + tenant_shard_id, + ShardIdentity::new( + shard_number, + shard_count, + pageserver_api::shard::ShardStripeSize(32768), + ) + .unwrap(), + policy, + ) + } + + fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { + let tenant_id = TenantId::generate(); + + (0..shard_count.count()) + .map(|i| { + let shard_number = ShardNumber(i); + + let tenant_shard_id = TenantShardId { + tenant_id, + shard_number, + shard_count, + }; + TenantShard::new( + tenant_shard_id, + ShardIdentity::new( + shard_number, + shard_count, + pageserver_api::shard::ShardStripeSize(32768), + ) + .unwrap(), + policy.clone(), + ) + }) + .collect() + } + + /// Test the scheduling behaviors used when a tenant configured for HA is subject + /// to nodes being marked offline. + #[test] + fn tenant_ha_scheduling() -> anyhow::Result<()> { + // Start with three nodes. Our tenant will only use two. The third one is + // expected to remain unused. 
+ let mut nodes = make_test_nodes(3); + + let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("we have enough nodes, scheduling should work"); + + // Expect to initially be schedule on to different nodes + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_some()); + + let attached_node_id = tenant_shard.intent.attached.unwrap(); + let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap(); + assert_ne!(attached_node_id, secondary_node_id); + + // Notifying the attached node is offline should demote it to a secondary + let changed = tenant_shard + .intent + .demote_attached(&mut scheduler, attached_node_id); + assert!(changed); + assert!(tenant_shard.intent.attached.is_none()); + assert_eq!(tenant_shard.intent.secondary.len(), 2); + + // Update the scheduler state to indicate the node is offline + nodes + .get_mut(&attached_node_id) + .unwrap() + .set_availability(NodeAvailability::Offline); + scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); + + // Scheduling the node should promote the still-available secondary node to attached + tenant_shard + .schedule(&mut scheduler, &mut context) + .expect("active nodes are available"); + assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id); + + // The original attached node should have been retained as a secondary + assert_eq!( + *tenant_shard.intent.secondary.iter().last().unwrap(), + attached_node_id + ); + + tenant_shard.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + fn intent_from_observed() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + tenant_shard.observed.locations.insert( + 
NodeId(3), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedMulti, + generation: Some(2), + secondary_conf: None, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_shard.observed.locations.insert( + NodeId(2), + ObservedStateLocation { + conf: Some(LocationConfig { + mode: LocationConfigMode::AttachedStale, + generation: Some(1), + secondary_conf: None, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, + tenant_conf: TenantConfig::default(), + }), + }, + ); + + tenant_shard.intent_from_observed(&mut scheduler); + + // The highest generationed attached location gets used as attached + assert_eq!(tenant_shard.intent.attached, Some(NodeId(3))); + // Other locations get used as secondary + assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]); + + scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?; + + tenant_shard.intent.clear(&mut scheduler); + Ok(()) + } + + #[test] + fn scheduling_mode() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // In pause mode, schedule() shouldn't do anything + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; + assert!(tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok()); + assert!(tenant_shard.intent.all_pageservers().is_empty()); + + // In active mode, schedule() works + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; + assert!(tenant_shard + .schedule(&mut scheduler, &mut ScheduleContext::default()) + .is_ok()); + assert!(!tenant_shard.intent.all_pageservers().is_empty()); + + 
tenant_shard.intent.clear(&mut scheduler); + Ok(()) + } + + #[test] + fn optimize_attachment() -> anyhow::Result<()> { + let nodes = make_test_nodes(3); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // Initially: both nodes attached on shard 1, and both have secondary locations + // on different nodes. + shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(2)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + + let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context); + + // Either shard should recognize that it has the option to switch to a secondary location where there + // would be no other shards from the same tenant, and request to do so. + assert_eq!( + optimization_a, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(2) + }) + }) + ); + + // Note that these optimizing two shards in the same tenant with the same ScheduleContext is + // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility + // of [`Service::optimize_all`] to avoid trying + // to do optimizations for multiple shards in the same tenant at the same time. 
Generating + // both optimizations is just done for test purposes + let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context); + assert_eq!( + optimization_b, + Some(ScheduleOptimization { + sequence: shard_b.sequence, + action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment { + old_attached_node_id: NodeId(1), + new_attached_node_id: NodeId(3) + }) + }) + ); + + // Applying these optimizations should result in the end state proposed + shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]); + shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap()); + assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3))); + assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + #[test] + fn optimize_secondary() -> anyhow::Result<()> { + let nodes = make_test_nodes(4); + let mut scheduler = Scheduler::new(nodes.values()); + + let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1)); + + // Initially: both nodes attached on shard 1, and both have secondary locations + // on different nodes. 
+ shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1))); + shard_a.intent.push_secondary(&mut scheduler, NodeId(3)); + shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2))); + shard_b.intent.push_secondary(&mut scheduler, NodeId(3)); + + let mut schedule_context = ScheduleContext::default(); + schedule_context.avoid(&shard_a.intent.all_pageservers()); + schedule_context.push_attached(shard_a.intent.get_attached().unwrap()); + schedule_context.avoid(&shard_b.intent.all_pageservers()); + schedule_context.push_attached(shard_b.intent.get_attached().unwrap()); + + let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context); + + // Since there is a node with no locations available, the node with two locations for the + // same tenant should generate an optimization to move one away + assert_eq!( + optimization_a, + Some(ScheduleOptimization { + sequence: shard_a.sequence, + action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary { + old_node_id: NodeId(3), + new_node_id: NodeId(4) + }) + }) + ); + + shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap()); + assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1))); + assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]); + + shard_a.intent.clear(&mut scheduler); + shard_b.intent.clear(&mut scheduler); + + Ok(()) + } + + // Optimize til quiescent: this emulates what Service::optimize_all does, when + // called repeatedly in the background. 
+ fn optimize_til_idle( + nodes: &HashMap, + scheduler: &mut Scheduler, + shards: &mut [TenantShard], + ) { + let mut loop_n = 0; + loop { + let mut schedule_context = ScheduleContext::default(); + let mut any_changed = false; + + for shard in shards.iter() { + schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + } + + for shard in shards.iter_mut() { + let optimization = shard.optimize_attachment(nodes, &schedule_context); + if let Some(optimization) = optimization { + shard.apply_optimization(scheduler, optimization); + any_changed = true; + break; + } + + let optimization = shard.optimize_secondary(scheduler, &schedule_context); + if let Some(optimization) = optimization { + shard.apply_optimization(scheduler, optimization); + any_changed = true; + break; + } + } + + if !any_changed { + break; + } + + // Assert no infinite loop + loop_n += 1; + assert!(loop_n < 1000); + } + } + + /// Test the balancing behavior of shard scheduling: that it achieves a balance, and + /// that it converges. + #[test] + fn optimize_add_nodes() -> anyhow::Result<()> { + let nodes = make_test_nodes(4); + + // Only show the scheduler a couple of nodes + let mut scheduler = Scheduler::new([].iter()); + scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap()); + + let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4)); + let mut schedule_context = ScheduleContext::default(); + for shard in &mut shards { + assert!(shard + .schedule(&mut scheduler, &mut schedule_context) + .is_ok()); + } + + // We should see equal number of locations on the two nodes. 
+ assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + // Scheduling does not consider the number of attachments picking the initial + // pageserver to attach to (hence the assertion that all primaries are on the + // same node) + // TODO: Tweak the scheduling to evenly distribute attachments for new shards. + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0); + + // Add another two nodes: we should see the shards spread out when their optimize + // methods are called + scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap()); + scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap()); + optimize_til_idle(&nodes, &mut scheduler, &mut shards); + + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1); + + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1); + + for shard in shards.iter_mut() { + shard.intent.clear(&mut scheduler); + } + + Ok(()) + } +} diff --git a/s3_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml similarity index 80% rename from s3_scrubber/Cargo.toml rename to storage_scrubber/Cargo.toml index 4d136472e0..050be66483 100644 --- a/s3_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "s3_scrubber" +name = "storage_scrubber" version = "0.1.0" edition.workspace = true license.workspace = true @@ -11,6 +11,7 @@ either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true hex.workspace = true +humantime.workspace = true thiserror.workspace = true 
rand.workspace = true bytes.workspace = true @@ -22,9 +23,17 @@ serde_with.workspace = true workspace_hack.workspace = true utils.workspace = true async-stream.workspace = true +tokio-postgres-rustls.workspace = true +postgres_ffi.workspace = true tokio-stream.workspace = true +tokio-postgres.workspace = true +tokio-util = { workspace = true } futures-util.workspace = true itertools.workspace = true +camino.workspace = true +rustls.workspace = true +rustls-native-certs.workspace = true +once_cell.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } diff --git a/s3_scrubber/README.md b/storage_scrubber/README.md similarity index 78% rename from s3_scrubber/README.md rename to storage_scrubber/README.md index 2f21b9f191..0930f343ec 100644 --- a/s3_scrubber/README.md +++ b/storage_scrubber/README.md @@ -1,4 +1,4 @@ -# Neon S3 scrubber +# Neon Storage Scrubber This tool directly accesses the S3 buckets used by the Neon `pageserver` and `safekeeper`, and does housekeeping such as cleaning up objects for tenants & timelines that no longer exist. @@ -9,11 +9,13 @@ and `safekeeper`, and does housekeeping such as cleaning up objects for tenants #### S3 -Do `aws sso login --profile dev` to get the SSO access to the bucket to clean, get the SSO_ACCOUNT_ID for your profile (`cat ~/.aws/config` may help). +Do `aws sso login --profile dev` to get the SSO access to the bucket to clean. +Also, set the following environment variables: -- `SSO_ACCOUNT_ID`: Credentials id to use for accessing S3 buckets +- `AWS_PROFILE`: Profile name to use for accessing S3 buckets (e.g. `dev`) - `REGION`: A region where the bucket is located at. - `BUCKET`: Bucket name +- `BUCKET_PREFIX` (optional): Prefix inside the bucket #### Console API @@ -43,7 +45,7 @@ processing by the `purge-garbage` subcommand. 
Example: -`env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- find-garbage --node-kind=pageserver --depth=tenant --output-path=eu-west-1-garbage.json` #### `purge-garbage` @@ -59,7 +61,7 @@ to pass them on the command line Example: -`env SSO_ACCOUNT_ID=123456 cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` +`env AWS_PROFILE=dev cargo run --release -- purge-garbage --node-kind=pageserver --depth=tenant --input-path=eu-west-1-garbage.json` Add the `--delete` argument before `purge-garbage` to enable deletion. This is intentionally not provided inline in the example above to avoid accidents. Without the `--delete` flag @@ -67,10 +69,12 @@ the purge command will log all the keys that it would have deleted. #### `scan-metadata` -Walk objects in a pageserver S3 bucket, and report statistics on the contents. +Walk objects in a pageserver or safekeeper S3 bucket, and report statistics on the contents and checking consistency. +Errors are logged to stderr and summary to stdout. 
+For pageserver: ``` -env SSO_ACCOUNT_ID=123456 REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata +env AWS_PROFILE=dev REGION=eu-west-1 BUCKET=my-dev-bucket CLOUD_ADMIN_API_TOKEN=${NEON_CLOUD_ADMIN_API_STAGING_KEY} CLOUD_ADMIN_API_URL=[url] cargo run --release -- scan-metadata --node-kind pageserver Timelines: 31106 With errors: 3 @@ -82,6 +86,10 @@ Layer size bytes: min 24576, 1% 36879, 10% 36879, 50% 61471, 90% 44695551, 99% 2 Timeline layer count: min 1, 1% 3, 10% 6, 50% 16, 90% 25, 99% 39, max 1053 ``` +For safekeepers, dump_db_connstr and dump_db_table must be +specified; they should point to table with debug dump which will be used +to list timelines and find their backup and start LSNs. + ## Cleaning up running pageservers If S3 state is altered first manually, pageserver in-memory state will contain wrong data about S3 state, and tenants/timelines may get recreated on S3 (due to any layer upload due to compaction, pageserver restart, etc.). So before proceeding, for tenants/timelines which are already deleted in the console, we must remove these from pageservers. 
diff --git a/s3_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs similarity index 80% rename from s3_scrubber/src/checks.rs rename to storage_scrubber/src/checks.rs index 7b9f96dce3..4eb8580e32 100644 --- a/s3_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,8 +1,8 @@ use std::collections::{HashMap, HashSet}; use anyhow::Context; -use aws_sdk_s3::{types::ObjectIdentifier, Client}; -use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata; +use aws_sdk_s3::Client; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; use tracing::{error, info, warn}; use utils::generation::Generation; @@ -11,9 +11,9 @@ use utils::id::TimelineId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; -use futures_util::{pin_mut, StreamExt}; +use futures_util::StreamExt; use pageserver::tenant::remote_timeline_client::parse_remote_index_path; -use pageserver::tenant::storage_layer::LayerFileName; +use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::RemotePath; @@ -70,7 +70,7 @@ pub(crate) fn branch_cleanup_and_check_errors( match s3_data { Some(s3_data) => { - result.garbage_keys.extend(s3_data.keys_to_remove); + result.garbage_keys.extend(s3_data.unknown_keys); match s3_data.blob_data { BlobDataParseResult::Parsed { @@ -78,27 +78,26 @@ pub(crate) fn branch_cleanup_and_check_errors( index_part_generation: _index_part_generation, s3_layers: _s3_layers, } => { - if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { - result.errors.push(format!( - "index_part.json version: {}", - index_part.get_version() - )) + if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) { + result + .errors + .push(format!("index_part.json version: {}", index_part.version())) } - if &index_part.get_version() != 
IndexPart::KNOWN_VERSIONS.last().unwrap() { + if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { result.warnings.push(format!( "index_part.json version is not latest: {}", - index_part.get_version() + index_part.version() )) } if index_part.metadata.disk_consistent_lsn() - != index_part.get_disk_consistent_lsn() + != index_part.duplicated_disk_consistent_lsn() { result.errors.push(format!( "Mismatching disk_consistent_lsn in TimelineMetadata ({}) and in the index_part ({})", index_part.metadata.disk_consistent_lsn(), - index_part.get_disk_consistent_lsn(), + index_part.duplicated_disk_consistent_lsn(), )) } @@ -110,7 +109,7 @@ pub(crate) fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer.file_name(), + "index_part.json contains a layer {} that has 0 size in its layer metadata", layer, )) } @@ -121,7 +120,7 @@ pub(crate) fn branch_cleanup_and_check_errors( // layer we think is missing. result.errors.push(format!( "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage", - layer.file_name(), + layer, metadata.generation.get_suffix(), metadata.shard )) @@ -170,8 +169,7 @@ pub(crate) struct LayerRef { /// the tenant to query whether an object exists. 
#[derive(Default)] pub(crate) struct TenantObjectListing { - shard_timelines: - HashMap<(ShardIndex, TimelineId), HashMap<(LayerFileName, Generation), LayerRef>>, + shard_timelines: HashMap<(ShardIndex, TimelineId), HashMap<(LayerName, Generation), LayerRef>>, } impl TenantObjectListing { @@ -180,7 +178,7 @@ impl TenantObjectListing { pub(crate) fn push( &mut self, ttid: TenantShardTimelineId, - layers: HashSet<(LayerFileName, Generation)>, + layers: HashSet<(LayerName, Generation)>, ) { let shard_index = ShardIndex::new( ttid.tenant_shard_id.shard_number, @@ -208,8 +206,8 @@ impl TenantObjectListing { pub(crate) fn check_ref( &mut self, timeline_id: TimelineId, - layer_file: &LayerFileName, - metadata: &IndexLayerMetadata, + layer_file: &LayerName, + metadata: &LayerFileMetadata, ) -> bool { let Some(shard_tl) = self.shard_timelines.get_mut(&(metadata.shard, timeline_id)) else { return false; @@ -224,7 +222,7 @@ impl TenantObjectListing { true } - pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerFileName, Generation)> { + pub(crate) fn get_orphans(&self) -> Vec<(ShardIndex, TimelineId, LayerName, Generation)> { let mut result = Vec::new(); for ((shard_index, timeline_id), layers) in &self.shard_timelines { for ((layer_file, generation), layer_ref) in layers { @@ -241,31 +239,36 @@ impl TenantObjectListing { #[derive(Debug)] pub(crate) struct S3TimelineBlobData { pub(crate) blob_data: BlobDataParseResult, - pub(crate) keys_to_remove: Vec, + + // Index objects that were not used when loading `blob_data`, e.g. those from old generations + pub(crate) unused_index_keys: Vec, + + // Objects whose keys were not recognized at all, i.e. 
not layer files, not indices + pub(crate) unknown_keys: Vec, } #[derive(Debug)] pub(crate) enum BlobDataParseResult { Parsed { - index_part: IndexPart, + index_part: Box, index_part_generation: Generation, - s3_layers: HashSet<(LayerFileName, Generation)>, + s3_layers: HashSet<(LayerName, Generation)>, }, /// The remains of a deleted Timeline (i.e. an initdb archive only) Relic, Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> { +fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? Some((layer_filename, gen)) if gen.len() == 8 => { - let layer = layer_filename.parse::()?; + let layer = layer_filename.parse::()?; let gen = Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?; Ok((layer, gen)) } - _ => Ok((name.parse::()?, Generation::none())), + _ => Ok((name.parse::()?, Generation::none())), } } @@ -277,16 +280,15 @@ pub(crate) async fn list_timeline_blobs( let mut s3_layers = HashSet::new(); let mut errors = Vec::new(); - let mut keys_to_remove = Vec::new(); + let mut unknown_keys = Vec::new(); let mut timeline_dir_target = s3_root.timeline_root(&id); timeline_dir_target.delimiter = String::new(); - let mut index_parts: Vec = Vec::new(); + let mut index_part_keys: Vec = Vec::new(); let mut initdb_archive: bool = false; - let stream = stream_listing(s3_client, &timeline_dir_target); - pin_mut!(stream); + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); while let Some(obj) = stream.next().await { let obj = obj?; let key = obj.key(); @@ -294,16 +296,16 @@ pub(crate) async fn list_timeline_blobs( let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket); match blob_name { Some(name) if name.starts_with("index_part.json") => { - tracing::info!("Index key {key}"); - index_parts.push(obj) + tracing::debug!("Index key {key}"); + 
index_part_keys.push(key.to_owned()) } Some("initdb.tar.zst") => { - tracing::info!("initdb archive {key}"); + tracing::debug!("initdb archive {key}"); initdb_archive = true; } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { - tracing::info!("Parsed layer key: {} {:?}", new_layer, gen); + tracing::debug!("Parsed layer key: {} {:?}", new_layer, gen); s3_layers.insert((new_layer, gen)); } Err(e) => { @@ -311,37 +313,37 @@ pub(crate) async fn list_timeline_blobs( errors.push( format!("S3 list response got an object with key {key} that is not a layer name: {e}"), ); - keys_to_remove.push(key.to_string()); + unknown_keys.push(key.to_string()); } }, None => { - tracing::info!("Peculiar key {}", key); + tracing::warn!("Unknown key {}", key); errors.push(format!("S3 list response got an object with odd key {key}")); - keys_to_remove.push(key.to_string()); + unknown_keys.push(key.to_string()); } } } - if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive { - tracing::info!( + if index_part_keys.is_empty() && s3_layers.is_empty() && initdb_archive { + tracing::debug!( "Timeline is empty apart from initdb archive: expected post-deletion state." ); return Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Relic, - keys_to_remove: Vec::new(), + unused_index_keys: index_part_keys, + unknown_keys: Vec::new(), }); } // Choose the index_part with the highest generation - let (index_part_object, index_part_generation) = match index_parts + let (index_part_object, index_part_generation) = match index_part_keys .iter() - .filter_map(|k| { - let key = k.key(); + .filter_map(|key| { // Stripping the index key to the last part, because RemotePath doesn't // like absolute paths, and depending on prefix_in_bucket it's possible // for the keys we read back to start with a slash. 
let basename = key.rsplit_once('/').unwrap().1; - parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g)) + parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (key, g)) }) .max_by_key(|i| i.1) .map(|(k, g)| (k.clone(), g)) @@ -349,15 +351,18 @@ pub(crate) async fn list_timeline_blobs( Some((key, gen)) => (Some(key), gen), None => { // Legacy/missing case: one or zero index parts, which did not have a generation - (index_parts.pop(), Generation::none()) + (index_part_keys.pop(), Generation::none()) } }; - if index_part_object.is_none() { - errors.push("S3 list response got no index_part.json file".to_string()); + match index_part_object.as_ref() { + Some(selected) => index_part_keys.retain(|k| k != selected), + None => { + errors.push("S3 list response got no index_part.json file".to_string()); + } } - if let Some(index_part_object_key) = index_part_object.as_ref().map(|object| object.key()) { + if let Some(index_part_object_key) = index_part_object.as_ref() { let index_part_bytes = download_object_with_retries( s3_client, &timeline_dir_target.bucket_name, @@ -370,21 +375,18 @@ pub(crate) async fn list_timeline_blobs( Ok(index_part) => { return Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Parsed { - index_part, + index_part: Box::new(index_part), index_part_generation, s3_layers, }, - keys_to_remove, + unused_index_keys: index_part_keys, + unknown_keys, }) } Err(index_parse_error) => errors.push(format!( "index_part.json body parsing error: {index_parse_error}" )), } - } else { - errors.push(format!( - "Index part object {index_part_object:?} has no key" - )); } if errors.is_empty() { @@ -395,6 +397,7 @@ pub(crate) async fn list_timeline_blobs( Ok(S3TimelineBlobData { blob_data: BlobDataParseResult::Incorrect(errors), - keys_to_remove, + unused_index_keys: index_part_keys, + unknown_keys, }) } diff --git a/s3_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs similarity index 71% 
rename from s3_scrubber/src/cloud_admin_api.rs rename to storage_scrubber/src/cloud_admin_api.rs index 151421c84f..70b108cf23 100644 --- a/s3_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -1,15 +1,13 @@ -#![allow(unused)] - -use std::str::FromStr; -use std::time::Duration; - use chrono::{DateTime, Utc}; +use futures::Future; use hex::FromHex; -use pageserver::tenant::Tenant; + use reqwest::{header, Client, StatusCode, Url}; use serde::Deserialize; use tokio::sync::Semaphore; +use tokio_util::sync::CancellationToken; +use utils::backoff; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -141,7 +139,7 @@ pub struct ProjectData { pub region_id: String, pub platform_id: String, pub user_id: String, - pub pageserver_id: u64, + pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, pub safekeepers: Vec, @@ -159,7 +157,7 @@ pub struct ProjectData { pub maintenance_set: Option, } -#[derive(Debug, serde::Deserialize)] +#[derive(Debug, Clone, serde::Deserialize)] pub struct BranchData { pub id: BranchId, pub created_at: DateTime, @@ -214,30 +212,39 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("tenant_id", tenant_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("tenant_id", tenant_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + 
ErrorKind::RequestSend(e), + ) + })?; + + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find project for tenant".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, + "find_tenant_project", + ) + .await?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find project for tenant".to_string(), - ErrorKind::BodyRead(e), - ) - })?; match response.data.len() { 0 => Ok(None), 1 => Ok(Some( @@ -265,42 +272,34 @@ impl CloudAdminApiClient { const PAGINATION_LIMIT: usize = 512; let mut result: Vec = Vec::with_capacity(PAGINATION_LIMIT); loop { - let response = self - .http_client - .get(self.append_url("/projects")) - .query(&[ - ("show_deleted", "false".to_string()), - ("limit", format!("{PAGINATION_LIMIT}")), - ("offset", format!("{pagination_offset}")), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "List active projects".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response_bytes = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/projects")) + .query(&[ + ("show_deleted", "false".to_string()), + ("limit", format!("{PAGINATION_LIMIT}")), + ("offset", format!("{pagination_offset}")), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "List active projects".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - match response.status() { - StatusCode::OK => {} - StatusCode::SERVICE_UNAVAILABLE | StatusCode::TOO_MANY_REQUESTS => { - tokio::time::sleep(Duration::from_millis(500)).await; - continue; - } - status => { - return Err(Error::new( - "List active projects".to_string(), - ErrorKind::ResponseStatus(response.status()), - )) - } - } - - let response_bytes = response.bytes().await.map_err(|e| { - Error::new("List active 
projects".to_string(), ErrorKind::BodyRead(e)) - })?; + response.bytes().await.map_err(|e| { + Error::new("List active projects".to_string(), ErrorKind::BodyRead(e)) + }) + }, + "list_projects", + ) + .await?; let decode_result = serde_json::from_slice::>>(&response_bytes); @@ -331,6 +330,7 @@ impl CloudAdminApiClient { pub async fn find_timeline_branch( &self, + tenant_id: TenantId, timeline_id: TimelineId, ) -> Result, Error> { let _permit = self @@ -339,43 +339,61 @@ impl CloudAdminApiClient { .await .expect("Semaphore is not closed"); - let response = self - .http_client - .get(self.append_url("/branches")) - .query(&[ - ("timeline_id", timeline_id.to_string()), - ("show_deleted", "true".to_string()), - ]) - .header(header::ACCEPT, "application/json") - .bearer_auth(&self.token) - .send() - .await - .map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::RequestSend(e), - ) - })?; + let response = CloudAdminApiClient::with_retries( + || async { + let response = self + .http_client + .get(self.append_url("/branches")) + .query(&[ + ("timeline_id", timeline_id.to_string()), + ("show_deleted", "true".to_string()), + ]) + .header(header::ACCEPT, "application/json") + .bearer_auth(&self.token) + .send() + .await + .map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::RequestSend(e), + ) + })?; - let response: AdminApiResponse> = response.json().await.map_err(|e| { - Error::new( - "Find branch for timeline".to_string(), - ErrorKind::BodyRead(e), - ) - })?; - match response.data.len() { - 0 => Ok(None), - 1 => Ok(Some( - response - .data - .into_iter() - .next() - .expect("Should have exactly one element"), - )), - too_many => Err(Error::new( - format!("Find branch for timeline returned {too_many} branches instead of 0 or 1"), + let response: AdminApiResponse> = + response.json().await.map_err(|e| { + Error::new( + "Find branch for timeline".to_string(), + ErrorKind::BodyRead(e), + ) + })?; + Ok(response) + }, 
+ "find_timeline_branch", + ) + .await?; + + let mut branches: Vec = response.data.into_iter().collect(); + // Normally timeline_id is unique. However, we do have at least one case + // of the same timeline_id in two different projects, apparently after + // manual recovery. So always recheck project_id (discovered through + // tenant_id). + let project_data = match self.find_tenant_project(tenant_id).await? { + Some(pd) => pd, + None => return Ok(None), + }; + branches.retain(|b| b.project_id == project_data.id); + if branches.len() < 2 { + Ok(branches.first().cloned()) + } else { + Err(Error::new( + format!( + "Find branch for timeline {}/{} returned {} branches instead of 0 or 1", + tenant_id, + timeline_id, + branches.len() + ), ErrorKind::UnexpectedState, - )), + )) } } @@ -536,4 +554,15 @@ impl CloudAdminApiClient { .parse() .unwrap_or_else(|e| panic!("Could not append {subpath} to base url: {e}")) } + + async fn with_retries(op: O, description: &str) -> Result + where + O: FnMut() -> F, + F: Future>, + { + let cancel = CancellationToken::new(); // not really used + backoff::retry(op, |_| false, 1, 20, description, &cancel) + .await + .expect("cancellations are disabled") + } } diff --git a/s3_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs similarity index 85% rename from s3_scrubber/src/garbage.rs rename to storage_scrubber/src/garbage.rs index 93bb115883..ce0ff10ec6 100644 --- a/s3_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -12,7 +12,7 @@ use aws_sdk_s3::{ types::{Delete, ObjectIdentifier}, Client, }; -use futures_util::{pin_mut, TryStreamExt}; +use futures_util::TryStreamExt; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; @@ -60,6 +60,7 @@ pub struct GarbageList { /// see garbage, we saw some active tenants too. This protects against classes of bugs /// in the scrubber that might otherwise generate a "deleted all" result. 
active_tenant_count: usize, + active_timeline_count: usize, } impl GarbageList { @@ -67,6 +68,7 @@ impl GarbageList { Self { items: Vec::new(), active_tenant_count: 0, + active_timeline_count: 0, node_kind, bucket_config, } @@ -119,7 +121,10 @@ pub async fn find_garbage( const S3_CONCURRENCY: usize = 32; // How many concurrent API requests to make to the console API. -const CONSOLE_CONCURRENCY: usize = 128; +// +// Be careful increasing this; roughly we shouldn't have more than ~100 rps. It +// would be better to implement real rsp limiter. +const CONSOLE_CONCURRENCY: usize = 16; struct ConsoleCache { /// Set of tenants found in the control plane API @@ -199,12 +204,12 @@ async fn find_garbage_inner( } } }); - let tenants_checked = tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut tenants_checked = + std::pin::pin!(tenants_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Process the results of Tenant checks. If a Tenant is garbage, it goes into // the `GarbageList`. Else it goes into `active_tenants` for more detailed timeline // checks if they are enabled by the `depth` parameter. 
- pin_mut!(tenants_checked); let mut garbage = GarbageList::new(node_kind, bucket_config); let mut active_tenants: Vec = vec![]; let mut counter = 0; @@ -221,6 +226,7 @@ async fn find_garbage_inner( } else { tracing::debug!("Tenant {tenant_shard_id} is active"); active_tenants.push(tenant_shard_id); + garbage.active_tenant_count = active_tenants.len(); } counter += 1; @@ -261,25 +267,39 @@ async fn find_garbage_inner( let api_client = cloud_admin_api_client.clone(); async move { api_client - .find_timeline_branch(ttid.timeline_id) + .find_timeline_branch(ttid.tenant_shard_id.tenant_id, ttid.timeline_id) .await .map_err(|e| anyhow::anyhow!(e)) .map(|r| (ttid, r)) } }); - let timelines_checked = timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY); + let mut timelines_checked = + std::pin::pin!(timelines_checked.try_buffer_unordered(CONSOLE_CONCURRENCY)); // Update the GarbageList with any timelines which appear not to exist. - pin_mut!(timelines_checked); + let mut active_timelines: Vec = vec![]; while let Some(result) = timelines_checked.next().await { let (ttid, console_result) = result?; if garbage.maybe_append(GarbageEntity::Timeline(ttid), console_result) { tracing::debug!("Timeline {ttid} is garbage"); } else { tracing::debug!("Timeline {ttid} is active"); + active_timelines.push(ttid); + garbage.active_timeline_count = active_timelines.len(); } } + let num_garbage_timelines = garbage + .items + .iter() + .filter(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + .count(); + tracing::info!( + "Found {}/{} garbage timelines in active tenants", + num_garbage_timelines, + active_timelines.len(), + ); + Ok(garbage) } @@ -344,16 +364,22 @@ pub async fn get_timeline_objects( const MAX_KEYS_PER_DELETE: usize = 1000; /// Drain a buffer of keys into DeleteObjects requests +/// +/// If `drain` is true, drains keys completely; otherwise stops when < +/// MAX_KEYS_PER_DELETE keys are left. +/// `num_deleted` returns number of deleted keys. 
async fn do_delete( s3_client: &Arc, bucket_name: &str, keys: &mut Vec, dry_run: bool, drain: bool, + progress_tracker: &mut DeletionProgressTracker, ) -> anyhow::Result<()> { while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) { let request_keys = keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len()))); + let num_deleted = request_keys.len(); if dry_run { tracing::info!("Dry-run deletion of objects: "); for k in request_keys { @@ -368,12 +394,30 @@ async fn do_delete( .send() .await .context("DeleteObjects request")?; + progress_tracker.register(num_deleted); } } Ok(()) } +/// Simple tracker reporting each 10k deleted keys. +#[derive(Default)] +struct DeletionProgressTracker { + num_deleted: usize, + last_reported_num_deleted: usize, +} + +impl DeletionProgressTracker { + fn register(&mut self, n: usize) { + self.num_deleted += n; + if self.num_deleted - self.last_reported_num_deleted > 10000 { + tracing::info!("progress: deleted {} keys", self.num_deleted); + self.last_reported_num_deleted = self.num_deleted; + } + } +} + pub async fn purge_garbage( input_path: String, mode: PurgeMode, @@ -394,6 +438,14 @@ pub async fn purge_garbage( if garbage_list.active_tenant_count == 0 { anyhow::bail!("Refusing to purge a garbage list that reports 0 active tenants"); } + if garbage_list + .items + .iter() + .any(|g| matches!(g.entity, GarbageEntity::Timeline(_))) + && garbage_list.active_timeline_count == 0 + { + anyhow::bail!("Refusing to purge a garbage list containing garbage timelines that reports 0 active timelines"); + } let filtered_items = garbage_list .items @@ -425,10 +477,11 @@ pub async fn purge_garbage( } } }); - let get_objects_results = get_objects_results.try_buffer_unordered(S3_CONCURRENCY); + let mut get_objects_results = + std::pin::pin!(get_objects_results.try_buffer_unordered(S3_CONCURRENCY)); - pin_mut!(get_objects_results); let mut objects_to_delete = Vec::new(); + let mut progress_tracker = 
DeletionProgressTracker::default(); while let Some(result) = get_objects_results.next().await { let mut object_list = result?; objects_to_delete.append(&mut object_list); @@ -439,6 +492,7 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, false, + &mut progress_tracker, ) .await?; } @@ -450,10 +504,11 @@ pub async fn purge_garbage( &mut objects_to_delete, dry_run, true, + &mut progress_tracker, ) .await?; - tracing::info!("Fell through"); + tracing::info!("{} keys deleted in total", progress_tracker.num_deleted); Ok(()) } diff --git a/s3_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs similarity index 76% rename from s3_scrubber/src/lib.rs rename to storage_scrubber/src/lib.rs index d2842877d0..64273432fc 100644 --- a/s3_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -4,7 +4,10 @@ pub mod checks; pub mod cloud_admin_api; pub mod garbage; pub mod metadata_stream; -pub mod scan_metadata; +pub mod pageserver_physical_gc; +pub mod scan_pageserver_metadata; +pub mod scan_safekeeper_metadata; +pub mod tenant_snapshot; use std::env; use std::fmt::Display; @@ -23,17 +26,18 @@ use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; use aws_smithy_async::rt::sleep::TokioSleep; +use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use reqwest::Url; use serde::{Deserialize, Serialize}; -use std::io::IsTerminal; use tokio::io::AsyncReadExt; use tracing::error; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use utils::id::TimelineId; +use utils::fs_ext; +use utils::id::{TenantId, TimelineId}; const MAX_RETRIES: usize = 20; const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN"; @@ -139,12 +143,34 @@ impl RootTarget { pub fn tenants_root(&self) -> S3Target { match self { Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME), - 
Self::Safekeeper(root) => root.with_sub_segment("wal"), + Self::Safekeeper(root) => root.clone(), } } pub fn tenant_root(&self, tenant_id: &TenantShardId) -> S3Target { - self.tenants_root().with_sub_segment(&tenant_id.to_string()) + match self { + Self::Pageserver(_) => self.tenants_root().with_sub_segment(&tenant_id.to_string()), + Self::Safekeeper(_) => self + .tenants_root() + .with_sub_segment(&tenant_id.tenant_id.to_string()), + } + } + + pub(crate) fn tenant_shards_prefix(&self, tenant_id: &TenantId) -> S3Target { + // Only pageserver remote storage contains tenant-shards + assert!(matches!(self, Self::Pageserver(_))); + let Self::Pageserver(root) = self else { + panic!(); + }; + + S3Target { + bucket_name: root.bucket_name.clone(), + prefix_in_bucket: format!( + "{}/{TENANTS_SEGMENT_NAME}/{tenant_id}", + root.prefix_in_bucket + ), + delimiter: root.delimiter.clone(), + } } pub fn timelines_root(&self, tenant_id: &TenantShardId) -> S3Target { @@ -175,30 +201,15 @@ impl RootTarget { } #[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct BucketConfig { pub region: String, pub bucket: String, pub prefix_in_bucket: Option, - - /// Use SSO if this is set, else rely on AWS_* environment vars - pub sso_account_id: Option, -} - -impl Display for BucketConfig { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}/{}/{}", - self.sso_account_id.as_deref().unwrap_or(""), - self.region, - self.bucket - ) - } } impl BucketConfig { pub fn from_env() -> anyhow::Result { - let sso_account_id = env::var("SSO_ACCOUNT_ID").ok(); let region = env::var("REGION").context("'REGION' param retrieval")?; let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?; let prefix_in_bucket = env::var("BUCKET_PREFIX").ok(); @@ -207,7 +218,6 @@ impl BucketConfig { region, bucket, prefix_in_bucket, - sso_account_id, }) } } @@ -240,7 +250,6 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { 
.with_ansi(false) .with_writer(file_writer); let stderr_logs = fmt::Layer::new() - .with_ansi(std::io::stderr().is_terminal()) .with_target(false) .with_writer(std::io::stderr); tracing_subscriber::registry() @@ -252,7 +261,7 @@ pub fn init_logging(file_name: &str) -> WorkerGuard { guard } -pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Client { +pub fn init_s3_client(bucket_region: Region) -> Client { let credentials_provider = { // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" let chain = CredentialsProviderChain::first_try( @@ -266,7 +275,7 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ); // Use SSO if we were given an account ID - match account_id { + match std::env::var("SSO_ACCOUNT_ID").ok() { Some(sso_account) => chain.or_else( "sso", SsoCredentialsProvider::builder() @@ -288,7 +297,10 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie let sleep_impl: Arc = Arc::new(TokioSleep::new()); let mut builder = Config::builder() - .behavior_version(BehaviorVersion::v2023_11_09()) + .behavior_version( + #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ + BehaviorVersion::v2023_11_09(), + ) .region(bucket_region) .retry_config(RetryConfig::adaptive().with_max_attempts(3)) .sleep_impl(SharedAsyncSleep::from(sleep_impl)) @@ -307,7 +319,7 @@ fn init_remote( ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region)); + let s3_client = Arc::new(init_s3_client(bucket_region)); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { @@ -319,9 +331,7 @@ fn init_remote( }), NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target { bucket_name: bucket_config.bucket, - prefix_in_bucket: bucket_config - .prefix_in_bucket - .unwrap_or("safekeeper/v1".to_string()), + 
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or("wal/".to_string()), delimiter, }), }; @@ -346,7 +356,10 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { - error!("list_objects_v2 query failed: {e}"); + error!( + "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", + s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + ); tokio::time::sleep(Duration::from_secs(1)).await; } } @@ -384,7 +397,7 @@ async fn download_object_with_retries( .await { Ok(bytes_read) => { - tracing::info!("Downloaded {bytes_read} bytes for object object with key {key}"); + tracing::debug!("Downloaded {bytes_read} bytes for object {key}"); return Ok(body_buf); } Err(e) => { @@ -396,3 +409,50 @@ async fn download_object_with_retries( anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") } + +async fn download_object_to_file( + s3_client: &Client, + bucket_name: &str, + key: &str, + version_id: Option<&str>, + local_path: &Utf8Path, +) -> anyhow::Result<()> { + let tmp_path = Utf8PathBuf::from(format!("{local_path}.tmp")); + for _ in 0..MAX_RETRIES { + tokio::fs::remove_file(&tmp_path) + .await + .or_else(fs_ext::ignore_not_found)?; + + let mut file = tokio::fs::File::create(&tmp_path) + .await + .context("Opening output file")?; + + let request = s3_client.get_object().bucket(bucket_name).key(key); + + let request = match version_id { + Some(version_id) => request.version_id(version_id), + None => request, + }; + + let response_stream = match request.send().await { + Ok(response) => response, + Err(e) => { + error!( + "Failed to download object for key {key} version {}: {e:#}", + version_id.unwrap_or("") + ); + tokio::time::sleep(Duration::from_secs(1)).await; + continue; + } + }; + + let mut read_stream = response_stream.body.into_async_read(); + + tokio::io::copy(&mut read_stream, &mut file).await?; + + tokio::fs::rename(&tmp_path, local_path).await?; + return Ok(()); + } 
+ + anyhow::bail!("Failed to download objects with key {key} {MAX_RETRIES} times") +} diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs new file mode 100644 index 0000000000..222bd10ed2 --- /dev/null +++ b/storage_scrubber/src/main.rs @@ -0,0 +1,203 @@ +use anyhow::bail; +use camino::Utf8PathBuf; +use pageserver_api::shard::TenantShardId; +use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; +use storage_scrubber::pageserver_physical_gc::GcMode; +use storage_scrubber::scan_pageserver_metadata::scan_metadata; +use storage_scrubber::tenant_snapshot::SnapshotDownloader; +use storage_scrubber::{ + init_logging, pageserver_physical_gc::pageserver_physical_gc, + scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, + TraversingDepth, +}; + +use clap::{Parser, Subcommand}; +use utils::id::TenantId; + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +#[command(arg_required_else_help(true))] +struct Cli { + #[command(subcommand)] + command: Command, + + #[arg(short, long, default_value_t = false)] + delete: bool, +} + +#[derive(Subcommand, Debug)] +enum Command { + FindGarbage { + #[arg(short, long)] + node_kind: NodeKind, + #[arg(short, long, default_value_t=TraversingDepth::Tenant)] + depth: TraversingDepth, + #[arg(short, long, default_value_t = String::from("garbage.json"))] + output_path: String, + }, + PurgeGarbage { + #[arg(short, long)] + input_path: String, + #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)] + mode: PurgeMode, + }, + #[command(verbatim_doc_comment)] + ScanMetadata { + #[arg(short, long)] + node_kind: NodeKind, + #[arg(short, long, default_value_t = false)] + json: bool, + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, + #[arg(long, default_value = None)] + /// For safekeeper node_kind only, points to db with debug dump + dump_db_connstr: Option, + /// For safekeeper node_kind only, table in the db with debug dump + 
#[arg(long, default_value = None)] + dump_db_table: Option, + }, + TenantSnapshot { + #[arg(long = "tenant-id")] + tenant_id: TenantId, + #[arg(long = "concurrency", short = 'j', default_value_t = 8)] + concurrency: usize, + #[arg(short, long)] + output_path: Utf8PathBuf, + }, + PageserverPhysicalGc { + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, + #[arg(long = "min-age")] + min_age: humantime::Duration, + #[arg(short, long, default_value_t = GcMode::IndicesOnly)] + mode: GcMode, + }, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = Cli::parse(); + + let bucket_config = BucketConfig::from_env()?; + + let command_log_name = match &cli.command { + Command::ScanMetadata { .. } => "scan", + Command::FindGarbage { .. } => "find-garbage", + Command::PurgeGarbage { .. } => "purge-garbage", + Command::TenantSnapshot { .. } => "tenant-snapshot", + Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", + }; + let _guard = init_logging(&format!( + "{}_{}_{}_{}.log", + std::env::args().next().unwrap(), + command_log_name, + bucket_config.bucket, + chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S") + )); + + match cli.command { + Command::ScanMetadata { + json, + tenant_ids, + node_kind, + dump_db_connstr, + dump_db_table, + } => { + if let NodeKind::Safekeeper = node_kind { + let dump_db_connstr = + dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; + let dump_db_table = + dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; + + let summary = scan_safekeeper_metadata( + bucket_config.clone(), + tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), + dump_db_connstr, + dump_db_table, + ) + .await?; + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + bail!("Fatal scrub errors detected"); + } + if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if 
someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + bail!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + ); + } + Ok(()) + } else { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) + } + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } + } + } + } + } + Command::FindGarbage { + node_kind, + depth, + output_path, + } => { + let console_config = ConsoleConfig::from_env()?; + find_garbage(bucket_config, console_config, depth, node_kind, output_path).await + } + Command::PurgeGarbage { input_path, mode } => { + purge_garbage(input_path, mode, !cli.delete).await + } + Command::TenantSnapshot { + tenant_id, + output_path, + concurrency, + } => { + let downloader = + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + downloader.download().await + } + Command::PageserverPhysicalGc { + tenant_ids, + min_age, + mode, + } => { + let summary = + pageserver_physical_gc(bucket_config, tenant_ids, min_age.into(), mode).await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) + } + } +} diff --git 
a/s3_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs similarity index 70% rename from s3_scrubber/src/metadata_stream.rs rename to storage_scrubber/src/metadata_stream.rs index 073f37f319..c05874f556 100644 --- a/s3_scrubber/src/metadata_stream.rs +++ b/storage_scrubber/src/metadata_stream.rs @@ -5,7 +5,7 @@ use tokio_stream::Stream; use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId}; use pageserver_api::shard::TenantShardId; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; /// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2 pub fn stream_tenants<'a>( @@ -45,6 +45,62 @@ pub fn stream_tenants<'a>( } } +pub async fn stream_tenant_shards<'a>( + s3_client: &'a Client, + target: &'a RootTarget, + tenant_id: TenantId, +) -> anyhow::Result> + 'a> { + let mut tenant_shard_ids: Vec> = Vec::new(); + let mut continuation_token = None; + let shards_target = target.tenant_shards_prefix(&tenant_id); + + loop { + tracing::info!("Listing in {}", shards_target.prefix_in_bucket); + let fetch_response = + list_objects_with_retries(s3_client, &shards_target, continuation_token.clone()).await; + let fetch_response = match fetch_response { + Err(e) => { + tenant_shard_ids.push(Err(e)); + break; + } + Ok(r) => r, + }; + + let new_entry_ids = fetch_response + .common_prefixes() + .iter() + .filter_map(|prefix| prefix.prefix()) + .filter_map(|prefix| -> Option<&str> { + prefix + .strip_prefix(&target.tenants_root().prefix_in_bucket)? + .strip_suffix('/') + }) + .map(|entry_id_str| { + let first_part = entry_id_str.split('/').next().unwrap(); + + first_part + .parse::() + .with_context(|| format!("Incorrect entry id str: {first_part}")) + }); + + for i in new_entry_ids { + tenant_shard_ids.push(i); + } + + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok(stream! 
{ + for i in tenant_shard_ids { + let id = i?; + yield Ok(id); + } + }) +} + /// Given a TenantShardId, output a stream of the timelines within that tenant, discovered /// using ListObjectsv2. The listing is done before the stream is built, so that this /// function can be used to generate concurrency on a stream using buffer_unordered. @@ -58,7 +114,7 @@ pub async fn stream_tenant_timelines<'a>( let timelines_target = target.timelines_root(&tenant); loop { - tracing::info!("Listing in {}", tenant); + tracing::debug!("Listing in {}", tenant); let fetch_response = list_objects_with_retries(s3_client, &timelines_target, continuation_token.clone()) .await; @@ -95,7 +151,7 @@ pub async fn stream_tenant_timelines<'a>( } } - tracing::info!("Yielding for {}", tenant); + tracing::debug!("Yielding for {}", tenant); Ok(stream! { for i in timeline_ids { let id = i?; diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs new file mode 100644 index 0000000000..0146433128 --- /dev/null +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -0,0 +1,239 @@ +use std::time::{Duration, UNIX_EPOCH}; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult}; +use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use aws_sdk_s3::Client; +use futures_util::{StreamExt, TryStreamExt}; +use pageserver::tenant::remote_timeline_client::parse_remote_index_path; +use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; +use remote_storage::RemotePath; +use serde::Serialize; +use tracing::{info_span, Instrument}; +use utils::generation::Generation; + +#[derive(Serialize, Default)] +pub struct GcSummary { + indices_deleted: usize, + remote_storage_errors: usize, +} + +#[derive(clap::ValueEnum, Debug, Clone, Copy)] +pub enum GcMode { + // Delete nothing + DryRun, + + // Enable only removing old-generation indices 
+ IndicesOnly, + // Enable all forms of GC + // TODO: this will be used when shard split ancestor layer deletion is added + // All, +} + +impl std::fmt::Display for GcMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GcMode::DryRun => write!(f, "dry-run"), + GcMode::IndicesOnly => write!(f, "indices-only"), + } + } +} + +async fn maybe_delete_index( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + latest_gen: Generation, + key: &str, + mode: GcMode, + summary: &mut GcSummary, +) { + // Validation: we will only delete things that parse cleanly + let basename = key.rsplit_once('/').unwrap().1; + let candidate_generation = + match parse_remote_index_path(RemotePath::from_string(basename).unwrap()) { + Some(g) => g, + None => { + if basename == IndexPart::FILE_NAME { + // A legacy pre-generation index + Generation::none() + } else { + // A strange key: we will not delete this because we don't understand it. + tracing::warn!("Bad index key"); + return; + } + } + }; + + // Validation: we will only delete indices more than one generation old, to avoid interfering + // in typical migrations, even if they are very long running. + if candidate_generation >= latest_gen { + // This shouldn't happen: when we loaded metadata, it should have selected the latest + // generation already, and only populated [`S3TimelineBlobData::unused_index_keys`] + // with older generations. + tracing::warn!("Deletion candidate is >= latest generation, this is a bug!"); + return; + } else if candidate_generation.next() == latest_gen { + // Skip deleting the latest-1th generation's index. + return; + } + + // Validation: we will only delete indices after one week, so that during incidents we will have + // easy access to recent indices. 
+ let age: Duration = match s3_client + .head_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(response) => match response.last_modified { + None => { + tracing::warn!("Missing last_modified"); + summary.remote_storage_errors += 1; + return; + } + Some(last_modified) => { + let last_modified = + UNIX_EPOCH + Duration::from_secs_f64(last_modified.as_secs_f64()); + match last_modified.elapsed() { + Ok(e) => e, + Err(_) => { + tracing::warn!("Bad last_modified time: {last_modified:?}"); + return; + } + } + } + }, + Err(e) => { + tracing::warn!("Failed to HEAD {key}: {e}"); + summary.remote_storage_errors += 1; + return; + } + }; + if &age < min_age { + tracing::info!( + "Skipping young object {} < {}", + age.as_secs_f64(), + min_age.as_secs_f64() + ); + return; + } + + if matches!(mode, GcMode::DryRun) { + tracing::info!("Dry run: would delete this key"); + return; + } + + // All validations passed: erase the object + match s3_client + .delete_object() + .bucket(&bucket_config.bucket) + .key(key) + .send() + .await + { + Ok(_) => { + tracing::info!("Successfully deleted index"); + summary.indices_deleted += 1; + } + Err(e) => { + tracing::warn!("Failed to delete index: {e}"); + summary.remote_storage_errors += 1; + } + } +} + +/// Physical garbage collection: removing unused S3 objects. This is distinct from the garbage collection +/// done inside the pageserver, which operates at a higher level (keys, layers). This type of garbage collection +/// is about removing: +/// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between +/// uploading a layer and uploading an index) +/// - Index objects from historic generations +/// +/// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and +/// make sure that object listings don't get slowed down by large numbers of garbage objects. 
+pub async fn pageserver_physical_gc( + bucket_config: BucketConfig, + tenant_ids: Vec, + min_age: Duration, + mode: GcMode, +) -> anyhow::Result { + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + + let tenants = if tenant_ids.is_empty() { + futures::future::Either::Left(stream_tenants(&s3_client, &target)) + } else { + futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + }; + + // How many tenants to process in parallel. We need to be mindful of pageservers + // accessing the same per tenant prefixes, so use a lower setting than pageservers. + const CONCURRENCY: usize = 32; + + // Generate a stream of TenantTimelineId + let timelines = tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, t)); + let timelines = timelines.try_buffered(CONCURRENCY); + let timelines = timelines.try_flatten(); + + // Generate a stream of S3TimelineBlobData + async fn gc_timeline( + s3_client: &Client, + bucket_config: &BucketConfig, + min_age: &Duration, + target: &RootTarget, + mode: GcMode, + ttid: TenantShardTimelineId, + ) -> anyhow::Result { + let mut summary = GcSummary::default(); + let data = list_timeline_blobs(s3_client, ttid, target).await?; + + let (latest_gen, candidates) = match &data.blob_data { + BlobDataParseResult::Parsed { + index_part: _index_part, + index_part_generation, + s3_layers: _s3_layers, + } => (*index_part_generation, data.unused_index_keys), + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. 
+ return Ok(summary); + } + BlobDataParseResult::Incorrect(reasons) => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!("Skipping timeline {ttid}, bad metadata: {reasons:?}"); + return Ok(summary); + } + }; + + for key in candidates { + maybe_delete_index( + s3_client, + bucket_config, + min_age, + latest_gen, + &key, + mode, + &mut summary, + ) + .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, key)) + .await; + } + + Ok(summary) + } + let timelines = timelines + .map_ok(|ttid| gc_timeline(&s3_client, &bucket_config, &min_age, &target, mode, ttid)); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + + let mut summary = GcSummary::default(); + + while let Some(i) = timelines.next().await { + let tl_summary = i?; + + summary.indices_deleted += tl_summary.indices_deleted; + summary.remote_storage_errors += tl_summary.remote_storage_errors; + } + + Ok(summary) +} diff --git a/s3_scrubber/src/scan_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs similarity index 98% rename from s3_scrubber/src/scan_metadata.rs rename to storage_scrubber/src/scan_pageserver_metadata.rs index 4b63bb3884..af74ffa4cd 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -7,7 +7,7 @@ use crate::checks::{ use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; use aws_sdk_s3::Client; -use futures_util::{pin_mut, StreamExt, TryStreamExt}; +use futures_util::{StreamExt, TryStreamExt}; use histogram::Histogram; use pageserver::tenant::remote_timeline_client::remote_layer_path; use pageserver::tenant::IndexPart; @@ -125,7 +125,7 @@ impl MetadataSummary { { *self .indices_by_version - .entry(index_part.get_version()) + .entry(index_part.version()) .or_insert(0) += 1; if let Err(e) = self.update_histograms(index_part) { @@ -226,7 
+226,7 @@ pub async fn scan_metadata( Ok((ttid, data)) } let timelines = timelines.map_ok(|ttid| report_on_timeline(&s3_client, &target, ttid)); - let timelines = timelines.try_buffered(CONCURRENCY); + let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); // We must gather all the TenantShardTimelineId->S3TimelineBlobData for each tenant, because different // shards in the same tenant might refer to one anothers' keys if a shard split has happened. @@ -309,7 +309,6 @@ pub async fn scan_metadata( // all results for the same tenant will be adjacent. We accumulate these, // and then call `analyze_tenant` to flush, when we see the next tenant ID. let mut summary = MetadataSummary::new(); - pin_mut!(timelines); while let Some(i) = timelines.next().await { let (ttid, data) = i?; summary.update_data(&data); diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs new file mode 100644 index 0000000000..24051b03de --- /dev/null +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -0,0 +1,248 @@ +use std::{collections::HashSet, str::FromStr, sync::Arc}; + +use aws_sdk_s3::Client; +use futures::stream::{StreamExt, TryStreamExt}; +use once_cell::sync::OnceCell; +use pageserver_api::shard::TenantShardId; +use postgres_ffi::{XLogFileName, PG_TLI}; +use serde::Serialize; +use tokio_postgres::types::PgLsn; +use tracing::{error, info, trace}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; + +use crate::{ + cloud_admin_api::CloudAdminApiClient, init_remote, metadata_stream::stream_listing, + BucketConfig, ConsoleConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; + +/// Generally we should ask safekeepers, but so far we use everywhere default 16MB. 
+const WAL_SEGSIZE: usize = 16 * 1024 * 1024; + +#[derive(Serialize)] +pub struct MetadataSummary { + timeline_count: usize, + with_errors: HashSet, + deleted_count: usize, +} + +impl MetadataSummary { + fn new() -> Self { + Self { + timeline_count: 0, + with_errors: HashSet::new(), + deleted_count: 0, + } + } + + pub fn summary_string(&self) -> String { + format!( + "timeline_count: {}, with_errors: {}", + self.timeline_count, + self.with_errors.len() + ) + } + + pub fn is_empty(&self) -> bool { + self.timeline_count == 0 + } + + pub fn is_fatal(&self) -> bool { + !self.with_errors.is_empty() + } +} + +/// Scan the safekeeper metadata in an S3 bucket, reporting errors and +/// statistics. +/// +/// It works by listing timelines along with timeline_start_lsn and backup_lsn +/// in debug dump in dump_db_table and verifying its s3 contents. If some WAL +/// segments are missing, before complaining control plane is queried to check if +/// the project wasn't deleted in the meanwhile. +pub async fn scan_safekeeper_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result { + info!( + "checking bucket {}, region {}, dump_db_table {}", + bucket_config.bucket, bucket_config.region, dump_db_table + ); + // Use rustls (Neon requires TLS) + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + let client_config = rustls::ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. 
+ tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", + dump_db_table, tenant_filter_clause, + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + info!("loaded {} timelines", timelines.len()); + + let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let console_config = ConsoleConfig::from_env()?; + let cloud_admin_api_client = CloudAdminApiClient::new(console_config); + + let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { + let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); + let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id"); + let timeline_start_lsn_pg: PgLsn = row.get(2); + let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); + let backup_lsn_pg: PgLsn = row.get(3); + let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let ttid = TenantTimelineId::new(tenant_id, timeline_id); + check_timeline( + &s3_client, + &target, + &cloud_admin_api_client, + ttid, + timeline_start_lsn, + backup_lsn, + ) + }); + // Run multiple check_timeline's concurrently. 
+ const CONCURRENCY: usize = 32; + let mut timelines = checks.try_buffered(CONCURRENCY); + + let mut summary = MetadataSummary::new(); + while let Some(r) = timelines.next().await { + let res = r?; + summary.timeline_count += 1; + if !res.is_ok { + summary.with_errors.insert(res.ttid); + } + if res.is_deleted { + summary.deleted_count += 1; + } + } + + Ok(summary) +} + +struct TimelineCheckResult { + ttid: TenantTimelineId, + is_ok: bool, + is_deleted: bool, // timeline is deleted in cplane +} + +/// List S3 and check that it has all expected WAL for the ttid. Consistency +/// errors are logged; returns a TimelineCheckResult with is_ok == false if +/// WAL is missing and the timeline is not deleted, Err if failed to check. +async fn check_timeline( + s3_client: &Client, + root: &RootTarget, + api_client: &CloudAdminApiClient, + ttid: TenantTimelineId, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +) -> anyhow::Result { + trace!( + "checking ttid {}, should contain WAL [{}-{}]", + ttid, + timeline_start_lsn, + backup_lsn + ); + // calculate expected segfiles; the range is half-open, so the segment containing backup_lsn is not expected + let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); + let expected_last_segno = backup_lsn.segment_number(WAL_SEGSIZE); + let mut expected_segfiles: HashSet = HashSet::from_iter( + (expected_first_segno..expected_last_segno) + .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), + ); + let expected_files_num = expected_segfiles.len(); + trace!("expecting {} files", expected_segfiles.len(),); + + // now list s3 and check if it misses something + let ttshid = + TenantShardTimelineId::new(TenantShardId::unsharded(ttid.tenant_id), ttid.timeline_id); + let mut timeline_dir_target = root.timeline_root(&ttshid); + // stream_listing yields only common_prefixes if delimiter is not empty, but + // we need files, so unset it.
+ timeline_dir_target.delimiter = String::new(); + + let mut stream = std::pin::pin!(stream_listing(s3_client, &timeline_dir_target)); + while let Some(obj) = stream.next().await { + let obj = obj?; + let key = obj.key(); + + let seg_name = key + .strip_prefix(&timeline_dir_target.prefix_in_bucket) + .expect("failed to extract segment name"); + expected_segfiles.remove(seg_name); + } + if !expected_segfiles.is_empty() { + // Before complaining check cplane, probably timeline is already deleted. + let bdata = api_client + .find_timeline_branch(ttid.tenant_id, ttid.timeline_id) + .await?; + let deleted = match bdata { + Some(bdata) => bdata.deleted, + None => { + // note: should be careful with selecting proper cplane address + info!("ttid {} not found, assuming it is deleted", ttid); + true + } + }; + if deleted { + // ok, branch is deleted + return Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: true, + }); + } + error!( + "ttid {}: missing {} files out of {}, timeline_start_lsn {}, wal_backup_lsn {}", + ttid, + expected_segfiles.len(), + expected_files_num, + timeline_start_lsn, + backup_lsn, + ); + return Ok(TimelineCheckResult { + ttid, + is_ok: false, + is_deleted: false, + }); + } + Ok(TimelineCheckResult { + ttid, + is_ok: true, + is_deleted: false, + }) +} + +fn load_certs() -> Result, std::io::Error> { + let der_certs = rustls_native_certs::load_native_certs()?; + let mut store = rustls::RootCertStore::empty(); + store.add_parsable_certificates(der_certs); + Ok(Arc::new(store)) +} +static TLS_ROOTS: OnceCell> = OnceCell::new(); diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs new file mode 100644 index 0000000000..450b337235 --- /dev/null +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -0,0 +1,290 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use crate::checks::{list_timeline_blobs, BlobDataParseResult, S3TimelineBlobData}; +use crate::metadata_stream::{stream_tenant_shards, 
stream_tenant_timelines}; +use crate::{ + download_object_to_file, init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, +}; +use anyhow::Context; +use async_stream::stream; +use aws_sdk_s3::Client; +use camino::Utf8PathBuf; +use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::storage_layer::LayerName; +use pageserver::tenant::IndexPart; +use pageserver_api::shard::TenantShardId; +use utils::generation::Generation; +use utils::id::TenantId; + +pub struct SnapshotDownloader { + s3_client: Arc, + s3_root: RootTarget, + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, +} + +impl SnapshotDownloader { + pub fn new( + bucket_config: BucketConfig, + tenant_id: TenantId, + output_path: Utf8PathBuf, + concurrency: usize, + ) -> anyhow::Result { + let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + Ok(Self { + s3_client, + s3_root, + bucket_config, + tenant_id, + output_path, + concurrency, + }) + } + + async fn download_layer( + &self, + ttid: TenantShardTimelineId, + layer_name: LayerName, + layer_metadata: LayerFileMetadata, + ) -> anyhow::Result<(LayerName, LayerFileMetadata)> { + // Note this is local as in a local copy of S3 data, not local as in the pageserver's local format. They use + // different layer names (remote-style has the generation suffix) + let local_path = self.output_path.join(format!( + "{}/timelines/{}/{}{}", + ttid.tenant_shard_id, + ttid.timeline_id, + layer_name, + layer_metadata.generation.get_suffix() + )); + + // We should only be called for layers that are owned by the input TTID + assert_eq!(layer_metadata.shard, ttid.tenant_shard_id.to_index()); + + // Assumption: we always write layer files atomically, and layer files are immutable. Therefore if the file + // already exists on local disk, we assume it is fully correct and skip it. 
+ if tokio::fs::try_exists(&local_path).await? { + tracing::debug!("{} already exists", local_path); + return Ok((layer_name, layer_metadata)); + } else { + tracing::debug!("{} requires download...", local_path); + + let timeline_root = self.s3_root.timeline_root(&ttid); + let remote_layer_path = format!( + "{}{}{}", + timeline_root.prefix_in_bucket, + layer_name, + layer_metadata.generation.get_suffix() + ); + + // List versions: the object might be deleted. + let versions = self + .s3_client + .list_object_versions() + .bucket(self.bucket_config.bucket.clone()) + .prefix(&remote_layer_path) + .send() + .await?; + let Some(version) = versions.versions.as_ref().and_then(|v| v.first()) else { + return Err(anyhow::anyhow!("No versions found for {remote_layer_path}")); + }; + download_object_to_file( + &self.s3_client, + &self.bucket_config.bucket, + &remote_layer_path, + version.version_id.as_deref(), + &local_path, + ) + .await?; + + tracing::debug!("Downloaded successfully to {local_path}"); + } + + Ok((layer_name, layer_metadata)) + } + + /// Download many layers belonging to the same TTID, with some concurrency + async fn download_layers( + &self, + ttid: TenantShardTimelineId, + layers: Vec<(LayerName, LayerFileMetadata)>, + ) -> anyhow::Result<()> { + let layer_count = layers.len(); + tracing::info!("Downloading {} layers for timeline {ttid}...", layer_count); + let layers_stream = stream! 
{ + for (layer_name, layer_metadata) in layers { + yield self.download_layer(ttid, layer_name, layer_metadata); + } + }; + + tokio::fs::create_dir_all(self.output_path.join(format!( + "{}/timelines/{}", + ttid.tenant_shard_id, ttid.timeline_id + ))) + .await?; + + let layer_results = layers_stream.buffered(self.concurrency); + let mut layer_results = std::pin::pin!(layer_results); + + let mut err = None; + let mut download_count = 0; + while let Some(i) = layer_results.next().await { + download_count += 1; + match i { + Ok((layer_name, layer_metadata)) => { + tracing::info!( + "[{download_count}/{layer_count}] OK: {} bytes {ttid} {}", + layer_metadata.file_size, + layer_name + ); + } + Err(e) => { + // Warn and continue: we will download what we can + tracing::warn!("Download error: {e}"); + err = Some(e); + } + } + } + if let Some(e) = err { + tracing::warn!("Some errors occurred downloading {ttid} layers, last error: {e}"); + Err(e) + } else { + Ok(()) + } + } + + async fn download_timeline( + &self, + ttid: TenantShardTimelineId, + index_part: Box, + index_part_generation: Generation, + ancestor_layers: &mut HashMap>, + ) -> anyhow::Result<()> { + let index_bytes = serde_json::to_string(&index_part).unwrap(); + + let layers = index_part + .layer_metadata + .into_iter() + .filter_map(|(layer_name, layer_metadata)| { + if layer_metadata.shard.shard_count != ttid.tenant_shard_id.shard_count { + // Accumulate ancestor layers for later download + let ancestor_ttid = TenantShardTimelineId::new( + TenantShardId { + tenant_id: ttid.tenant_shard_id.tenant_id, + shard_number: layer_metadata.shard.shard_number, + shard_count: layer_metadata.shard.shard_count, + }, + ttid.timeline_id, + ); + let ancestor_ttid_layers = ancestor_layers.entry(ancestor_ttid).or_default(); + use std::collections::hash_map::Entry; + match ancestor_ttid_layers.entry(layer_name) { + Entry::Occupied(entry) => { + // Descendent shards that reference a layer from an ancestor should always have 
matching metadata, + // as their siblings, because it is read atomically during a shard split. + assert_eq!(entry.get(), &layer_metadata); + } + Entry::Vacant(entry) => { + entry.insert(layer_metadata); + } + } + None + } else { + Some((layer_name, layer_metadata)) + } + }) + .collect(); + + let download_result = self.download_layers(ttid, layers).await; + + // Write index last, once all the layers it references are downloaded + let local_index_path = self.output_path.join(format!( + "{}/timelines/{}/index_part.json{}", + ttid.tenant_shard_id, + ttid.timeline_id, + index_part_generation.get_suffix() + )); + tokio::fs::write(&local_index_path, index_bytes) + .await + .context("writing index")?; + + download_result + } + + pub async fn download(&self) -> anyhow::Result<()> { + let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + + // Generate a stream of TenantShardId + let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; + let shards: Vec = shards.try_collect().await?; + + // Only read from shards that have the highest count: avoids redundantly downloading + // from ancestor shards. 
+ let Some(shard_count) = shards.iter().map(|s| s.shard_count).max() else { + anyhow::bail!("No shards found"); + }; + + // We will build a collection of layers in ancestor shards to download (this will only + // happen if this tenant has been split at some point) + let mut ancestor_layers: HashMap< + TenantShardTimelineId, + HashMap, + > = Default::default(); + + for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { + // Generate a stream of TenantTimelineId + let timelines = stream_tenant_timelines(&s3_client, &self.s3_root, shard).await?; + + // Generate a stream of S3TimelineBlobData + async fn load_timeline_index( + s3_client: &Client, + target: &RootTarget, + ttid: TenantShardTimelineId, + ) -> anyhow::Result<(TenantShardTimelineId, S3TimelineBlobData)> { + let data = list_timeline_blobs(s3_client, ttid, target).await?; + Ok((ttid, data)) + } + let timelines = timelines.map_ok(|ttid| load_timeline_index(&s3_client, &target, ttid)); + let mut timelines = std::pin::pin!(timelines.try_buffered(8)); + + while let Some(i) = timelines.next().await { + let (ttid, data) = i?; + match data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _, + } => { + self.download_timeline( + ttid, + index_part, + index_part_generation, + &mut ancestor_layers, + ) + .await + .context("Downloading timeline")?; + } + BlobDataParseResult::Relic => {} + BlobDataParseResult::Incorrect(_) => { + tracing::error!("Bad metadata in timeline {ttid}"); + } + }; + } + } + + for (ttid, layers) in ancestor_layers.into_iter() { + tracing::info!( + "Downloading {} layers from ancestor timeline {ttid}...", + layers.len() + ); + + self.download_layers(ttid, layers.into_iter().collect()) + .await?; + } + + Ok(()) + } +} diff --git a/test_runner/README.md b/test_runner/README.md index 96e74659ce..7d95634ea8 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -76,13 +76,10 @@ you can use `--pg-version` argument.
`TEST_OUTPUT`: Set the directory where test state and test output files should go. `TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests. -`NEON_PAGESERVER_OVERRIDES`: add a `;`-separated set of configs that will be passed as `RUST_LOG`: logging configuration to pass into Neon CLI Useful parameters and commands: -`--pageserver-config-override=${value}` `-c` values to pass into pageserver through neon_local cli - `--preserve-database-files` to preserve pageserver (layer) and safekeer (segment) timeline files on disk after running a test suite. Such files might be large, so removed by default; but might be useful for debugging or creation of svg images with layer file contents. @@ -95,6 +92,166 @@ Exit after the first test failure: `./scripts/pytest -x ...` (there are many more pytest options; run `pytest -h` to see them.) +#### Running Python tests against real S3 or S3-compatible services + +Neon's `libs/remote_storage` supports multiple implementations of remote storage. +At the time of writing, that is +```rust +pub enum RemoteStorageKind { + /// Storage based on local file system. + /// Specify a root folder to place all stored files into. + LocalFs(Utf8PathBuf), + /// AWS S3 based storage, storing all files in the S3 bucket + /// specified by the config + AwsS3(S3Config), + /// Azure Blob based storage, storing all files in the container + /// specified by the config + AzureContainer(AzureConfig), +} +``` + +The test suite has a Python enum with equal name but different meaning: + +```python +@enum.unique +class RemoteStorageKind(str, enum.Enum): + LOCAL_FS = "local_fs" + MOCK_S3 = "mock_s3" + REAL_S3 = "real_s3" +``` + +* `LOCAL_FS` => `LocalFs` +* `MOCK_S3`: starts [`moto`](https://github.com/getmoto/moto)'s S3 implementation, then configures Pageserver with `AwsS3` +* `REAL_S3` => configure `AwsS3` as detailed below + +When a test in the test suite needs an `AwsS3`, it is supposed to call `remote_storage.s3_storage()`. 
+That function checks env var `ENABLE_REAL_S3_REMOTE_STORAGE`: +* If it is not set, use `MOCK_S3` +* If it is set, use `REAL_S3`. + +For `REAL_S3`, the test suite creates the dict/toml representation of the `RemoteStorageKind::AwsS3` based on env vars: + +```rust +pub struct S3Config { + // test suite env var: REMOTE_STORAGE_S3_BUCKET + pub bucket_name: String, + // test suite env var: REMOTE_STORAGE_S3_REGION + pub bucket_region: String, + // test suite determines this + pub prefix_in_bucket: Option<String>, + // no env var exists; test suite sets it for MOCK_S3, because that's how moto works + pub endpoint: Option<String>, + ... +} +``` + +*Credentials* are not part of the config, but are discovered by the AWS SDK. +See the `libs/remote_storage` Rust code. +We're documenting two mechanisms here: + +The test suite supports two mechanisms (`remote_storage.py`): + +**Credential mechanism 1**: env vars `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. +Populate the env vars with AWS access keys that you created in IAM. +Our CI uses this mechanism. +However, it is _not_ recommended for interactive use by developers ([learn more](https://docs.aws.amazon.com/sdkref/latest/guide/access-users.html#credentials-long-term)). +Instead, use profiles (next section). + +**Credential mechanism 2**: env var `AWS_PROFILE`. +This uses the AWS SDK's (and CLI's) profile mechanism. +Learn more about it [in the official docs](https://docs.aws.amazon.com/sdkref/latest/guide/file-format.html). +After configuring a profile (e.g. via the aws CLI), set the env var to its name. + +In conclusion, the full command line is: + +```bash +# with long-term AWS access keys +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_ACCESS_KEY_ID=... \ +AWS_SECRET_ACCESS_KEY=...
\ +./scripts/pytest +``` + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=eu-central-1 \ +AWS_PROFILE=... \ +./scripts/pytest +``` + +If you're using SSO, make sure to `aws sso login --profile $AWS_PROFILE` first. + +##### Minio + +If you want to run tests without the cloud setup, we recommend [minio](https://min.io/docs/minio/linux/index.html). + +```bash +# Start in Terminal 1 +mkdir /tmp/minio_data +minio server /tmp/minio_data --console-address 127.0.0.1:9001 --address 127.0.0.1:9000 +``` + +In another terminal, create an `aws` CLI profile for it: + +```ini +# append to ~/.aws/config +[profile local-minio] +services = local-minio-services +[services local-minio-services] +s3 = + endpoint_url=http://127.0.0.1:9000/ +``` + + +Now configure the credentials (this is going to write `~/.aws/credentials` for you). +It's an interactive prompt. + +```bash +# Terminal 2 +$ aws --profile local-minio configure +AWS Access Key ID [None]: minioadmin +AWS Secret Access Key [None]: minioadmin +Default region name [None]: +Default output format [None]: +``` + +Now create a bucket `mybucket` using the CLI. + +```bash +# (don't forget to have AWS_PROFILE env var set; or use --profile) +aws --profile local-minio s3 mb s3://mybucket +``` + +(If it doesn't work, make sure you update your AWS CLI to a recent version. + The [service-specific endpoint feature](https://docs.aws.amazon.com/sdkref/latest/guide/feature-ss-endpoints.html) + that we're using is quite new.) + +```bash +# with AWS PROFILE +ENABLE_REAL_S3_REMOTE_STORAGE=true \ +REMOTE_STORAGE_S3_BUCKET=mybucket \ +REMOTE_STORAGE_S3_REGION=doesntmatterforminio \ +AWS_PROFILE=local-minio \ +./scripts/pytest +``` + +NB: you can avoid the `--profile` by setting the `AWS_PROFILE` variable. +Just like the AWS SDKs, the `aws` CLI is sensitive to it.
+ +#### Running Rust tests against real S3 or S3-compatible services + +We have some Rust tests that only run against real S3, e.g., [here](https://github.com/neondatabase/neon/blob/c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a/libs/remote_storage/tests/test_real_s3.rs#L392-L397). + +They use the same env vars as the Python test suite (see previous section) +but interpret them on their own. +However, at this time, the interpretation is identical. + +So, above instructions apply to the Rust test as well. + ### Writing a test Every test needs a Neon Environment, or NeonEnv to operate in. A Neon Environment @@ -128,6 +285,21 @@ def test_foobar(neon_env_builder: NeonEnvBuilder): ... ``` +The env includes a default tenant and timeline. Therefore, you do not need to create your own +tenant/timeline for testing. + +```python +def test_foobar2(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Start the environment + with env.endpoints.create_start("main") as endpoint: + # Start the compute endpoint + client = env.pageserver.http_client() # Get the pageserver client + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id) +``` + For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html At the end of a test, all the nodes in the environment are automatically stopped, so you diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 200c9c3740..4b0c9ac71d 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -2,6 +2,7 @@ pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", "fixtures.httpserver", + "fixtures.compute_reconfigure", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 297f2c6da7..038f557cc8 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ 
b/test_runner/fixtures/benchmark_fixture.py @@ -12,14 +12,16 @@ from pathlib import Path # Type-related stuff from typing import Callable, ClassVar, Dict, Iterator, Optional +import allure import pytest from _pytest.config import Config from _pytest.config.argparsing import Parser +from _pytest.fixtures import FixtureRequest from _pytest.terminal import TerminalReporter +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonPageserver -from fixtures.types import TenantId, TimelineId """ This file contains fixtures for micro-benchmarks. @@ -411,7 +413,10 @@ class NeonBenchmarker: @pytest.fixture(scope="function") -def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[NeonBenchmarker]: +def zenbenchmark( + request: FixtureRequest, + record_property: Callable[[str, object], None], +) -> Iterator[NeonBenchmarker]: """ This is a python decorator for benchmark fixtures. It contains functions for recording measurements, and prints them out at the end. 
@@ -419,6 +424,21 @@ def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[Neo benchmarker = NeonBenchmarker(record_property) yield benchmarker + results = {} + for _, recorded_property in request.node.user_properties: + name = recorded_property["name"] + value = str(recorded_property["value"]) + if (unit := recorded_property["unit"].strip()) != "": + value += f" {unit}" + results[name] = value + + content = json.dumps(results, indent=2) + allure.attach( + content, + "benchmarks.json", + allure.attachment_type.JSON, + ) + def pytest_addoption(parser: Parser): parser.addoption( @@ -462,20 +482,18 @@ def pytest_terminal_summary( terminalreporter.section("Benchmark results", "-") is_header_printed = True - terminalreporter.write( - "{}.{}: ".format(test_report.head_line, recorded_property["name"]) - ) + terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ") unit = recorded_property["unit"] value = recorded_property["value"] if unit == "MB": - terminalreporter.write("{0:,.0f}".format(value), green=True) + terminalreporter.write(f"{value:,.0f}", green=True) elif unit in ("s", "ms") and isinstance(value, float): - terminalreporter.write("{0:,.3f}".format(value), green=True) + terminalreporter.write(f"{value:,.3f}", green=True) elif isinstance(value, float): - terminalreporter.write("{0:,.4f}".format(value), green=True) + terminalreporter.write(f"{value:,.4f}", green=True) else: terminalreporter.write(str(value), green=True) - terminalreporter.line(" {}".format(unit)) + terminalreporter.line(f" {unit}") result_entry.append(recorded_property) diff --git a/test_runner/fixtures/broker.py b/test_runner/fixtures/broker.py index fa8b816e69..8aca90a097 100644 --- a/test_runner/fixtures/broker.py +++ b/test_runner/fixtures/broker.py @@ -54,7 +54,10 @@ class NeonBroker: else: break # success - def stop(self): + def stop(self, immediate: bool = False): if self.handle is not None: - self.handle.terminate() + if immediate: + 
self.handle.kill() + else: + self.handle.terminate() self.handle.wait() diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/common_types.py similarity index 83% rename from test_runner/fixtures/types.py rename to test_runner/fixtures/common_types.py index ea648e460d..147264762c 100644 --- a/test_runner/fixtures/types.py +++ b/test_runner/fixtures/common_types.py @@ -5,6 +5,8 @@ from typing import Any, Type, TypeVar, Union T = TypeVar("T", bound="Id") +DEFAULT_WAL_SEG_SIZE = 16 * 1024 * 1024 + @total_ordering class Lsn: @@ -67,6 +69,21 @@ class Lsn: def as_int(self) -> int: return self.lsn_int + def segment_lsn(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> "Lsn": + return Lsn(self.lsn_int - (self.lsn_int % seg_sz)) + + def segno(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> int: + return self.lsn_int // seg_sz + + def segment_name(self, seg_sz: int = DEFAULT_WAL_SEG_SIZE) -> str: + segno = self.segno(seg_sz) + # The filename format is 00000001XXXXXXXX000000YY, where XXXXXXXXYY is segno in hex. 
+ # XXXXXXXX is the higher 8 hex digits of segno + high_bits = segno >> 8 + # YY is the lower 2 hex digits of segno + low_bits = segno & 0xFF + return f"00000001{high_bits:08X}000000{low_bits:02X}" + @dataclass(frozen=True) class Key: @@ -156,7 +173,14 @@ class TenantShardId: raise ValueError(f"Invalid TenantShardId '{input}'") def __str__(self): - return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + if self.shard_count > 0: + return f"{self.tenant_id}-{self.shard_number:02x}{self.shard_count:02x}" + else: + # Unsharded case: equivalent of Rust TenantShardId::unsharded(tenant_id) + return str(self.tenant_id) + + def __repr__(self): + return self.__str__() def _tuple(self) -> tuple[TenantId, int, int]: return (self.tenant_id, self.shard_number, self.shard_count) diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 6fbaa08512..429b6af548 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -155,12 +155,23 @@ class NeonCompare(PgCompare): "size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER ) - metric_filters = {"tenant_id": str(self.tenant), "timeline_id": str(self.timeline)} + metric_filters = { + "tenant_id": str(self.tenant), + "timeline_id": str(self.timeline), + "file_kind": "layer", + "op_kind": "upload", + } + # use `started` (not `finished`) counters here, because some callers + # don't wait for upload queue to drain total_files = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_created_persistent_files_total", metric_filters + self.env.pageserver, + "pageserver_remote_timeline_client_calls_started_total", + metric_filters, ) total_bytes = self.zenbenchmark.get_int_counter_value( - self.env.pageserver, "pageserver_written_persistent_bytes_total", metric_filters + self.env.pageserver, + "pageserver_remote_timeline_client_bytes_started_total", + metric_filters, ) 
self.zenbenchmark.record( "data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py new file mode 100644 index 0000000000..66fc35b6aa --- /dev/null +++ b/test_runner/fixtures/compute_reconfigure.py @@ -0,0 +1,73 @@ +import concurrent.futures +from typing import Any + +import pytest +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + +from fixtures.common_types import TenantId +from fixtures.log_helper import log + + +class ComputeReconfigure: + def __init__(self, server): + self.server = server + self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" + self.workloads = {} + self.on_notify = None + + def register_workload(self, workload): + self.workloads[workload.tenant_id] = workload + + def register_on_notify(self, fn): + """ + Add some extra work during a notification, like sleeping to slow things down, or + logging what was notified. + """ + self.on_notify = fn + + +@pytest.fixture(scope="function") +def compute_reconfigure_listener(make_httpserver): + """ + This fixture exposes an HTTP listener for the storage controller to submit + compute notifications to us, instead of updating neon_local endpoints itself. + + Although storage controller can use neon_local directly, this causes problems when + the test is also concurrently modifying endpoints. Instead, configure storage controller + to send notifications up to this test code, which will route all endpoint updates + through Workload, which has a mutex to make concurrent updates safe. + """ + server = make_httpserver + + self = ComputeReconfigure(server) + + # Do neon_local endpoint reconfiguration in the background so that we can + # accept a healthy rate of calls into notify-attach. 
+ reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1) + + def handler(request: Request): + assert request.json is not None + body: dict[str, Any] = request.json + log.info(f"notify-attach request: {body}") + + if self.on_notify is not None: + self.on_notify(body) + + try: + workload = self.workloads[TenantId(body["tenant_id"])] + except KeyError: + pass + else: + # This causes the endpoint to query storage controller for its location, which + # is redundant since we already have it here, but this avoids extending the + # neon_local CLI to take full lists of locations + reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return] + + return Response(status=200) + + self.server.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) + + yield self + reconfigure_threads.shutdown() + server.clear() diff --git a/test_runner/fixtures/endpoint/__init__.py b/test_runner/fixtures/endpoint/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py new file mode 100644 index 0000000000..42f0539c19 --- /dev/null +++ b/test_runner/fixtures/endpoint/http.py @@ -0,0 +1,23 @@ +import requests +from requests.adapters import HTTPAdapter + + +class EndpointHttpClient(requests.Session): + def __init__( + self, + port: int, + ): + super().__init__() + self.port = port + + self.mount("http://", HTTPAdapter()) + + def dbs_and_roles(self): + res = self.get(f"http://localhost:{self.port}/dbs_and_roles") + res.raise_for_status() + return res.json() + + def database_schema(self, database: str): + res = self.get(f"http://localhost:{self.port}/database_schema?database={database}") + res.raise_for_status() + return res.text diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index a6a25da332..8b8075f8c1 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ 
-4,6 +4,8 @@ from typing import Dict, List, Optional, Tuple from prometheus_client.parser import text_string_to_metric_families from prometheus_client.samples import Sample +from fixtures.log_helper import log + class Metrics: metrics: Dict[str, List[Sample]] @@ -16,6 +18,7 @@ class Metrics: def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]: filter = filter or {} res = [] + for sample in self.metrics[name]: try: if all(sample.labels[k] == v for k, v in filter.items()): @@ -30,6 +33,60 @@ class Metrics: return res[0] +class MetricsGetter: + """ + Mixin for types that implement a `get_metrics` function and would like associated + helpers for querying the metrics + """ + + def get_metrics(self) -> Metrics: + raise NotImplementedError() + + def get_metric_value( + self, name: str, filter: Optional[Dict[str, str]] = None + ) -> Optional[float]: + metrics = self.get_metrics() + results = metrics.query_all(name, filter=filter) + if not results: + log.info(f'could not find metric "{name}"') + return None + assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" + return results[0].value + + def get_metrics_values( + self, names: list[str], filter: Optional[Dict[str, str]] = None, absence_ok=False + ) -> Dict[str, float]: + """ + When fetching multiple named metrics, it is more efficient to use this + than to call `get_metric_value` repeatedly. + + Throws RuntimeError if no metrics matching `names` are found, or if + not all of `names` are found: this method is intended for loading sets + of metrics whose existence is coupled. + + If it's expected that there may be no results for some of the metrics, + specify `absence_ok=True`. The returned dict will then not contain values + for these metrics. 
+ """ + metrics = self.get_metrics() + samples = [] + for name in names: + samples.extend(metrics.query_all(name, filter=filter)) + + result = {} + for sample in samples: + if sample.name in result: + raise RuntimeError(f"Multiple values found for {sample.name}") + result[sample.name] = sample.value + + if not absence_ok: + if len(result) != len(names): + log.info(f"Metrics found: {metrics.metrics}") + raise RuntimeError(f"could not find all metrics {' '.join(names)}") + + return result + + def parse_metrics(text: str, name: str = "") -> Metrics: metrics = Metrics(name) gen = text_string_to_metric_families(text) @@ -46,7 +103,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( - "pageserver_remote_timeline_client_calls_unfinished", + "pageserver_remote_timeline_client_calls_started_total", + "pageserver_remote_timeline_client_calls_finished_total", "pageserver_remote_physical_size", "pageserver_remote_timeline_client_bytes_started_total", "pageserver_remote_timeline_client_bytes_finished_total", @@ -71,11 +129,10 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_read_num_fs_layers"), + *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), - *histogram("pageserver_remote_timeline_client_calls_started"), *histogram("pageserver_io_operations_seconds"), "pageserver_tenant_states_count", ) @@ -85,15 +142,16 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_resident_physical_size", "pageserver_io_operations_bytes_total", "pageserver_last_record_lsn", + "pageserver_standby_horizon", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", - "pageserver_created_persistent_files_total", - "pageserver_written_persistent_bytes_total", "pageserver_evictions_total", "pageserver_evictions_with_low_residence_duration_total", + "pageserver_aux_file_estimated_size", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, - # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload + # "pageserver_directory_entries_count", -- only used if above a certain threshold + # "pageserver_broken_tenants_count" -- used only for broken ) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 001d4e23a9..49857d5151 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2,6 +2,7 @@ from __future__ import annotations import abc import asyncio +import concurrent.futures import filecmp import json import os @@ -10,20 +11,24 @@ import shutil import subprocess import tempfile import textwrap +import threading import time import uuid from contextlib import closing, contextmanager -from dataclasses import dataclass, field +from dataclasses import dataclass from datetime import datetime -from functools import cached_property +from enum import Enum +from fcntl import LOCK_EX, LOCK_UN, flock +from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, cast -from urllib.parse import urlparse +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast +from urllib.parse import 
quote, urlparse import asyncpg import backoff +import httpx import jwt import psycopg2 import pytest @@ -40,15 +45,23 @@ from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal from urllib3.util.retry import Retry +from fixtures import overlayfs from fixtures.broker import NeonBroker +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, - scan_pageserver_log_for_errors, + DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) +from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_layer_file_name from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pageserver.types import IndexPartDump -from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pageserver.utils import ( + wait_for_last_record_lsn, + wait_for_upload, + wait_for_upload_queue_empty, +) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( @@ -58,17 +71,21 @@ from fixtures.remote_storage import ( RemoteStorageUser, S3Storage, default_remote_storage, - remote_storage_to_toml_inline_table, + remote_storage_to_toml_dict, ) -from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.utils import are_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, allure_attach_from_dir, + assert_no_errors, get_self_dir, + print_gc_result, subprocess_capture, wait_until, ) +from fixtures.utils import AuxFileStore as AuxFileStore # reexport """ This file contains pytest fixtures. 
A fixture is a test resource that can be @@ -379,7 +396,8 @@ class PgProtocol: class AuthKeys: priv: str - def generate_token(self, *, scope: str, **token_data: str) -> str: + def generate_token(self, *, scope: TokenScope, **token_data: Any) -> str: + token_data = {key: str(val) for key, val in token_data.items()} token = jwt.encode({"scope": scope, **token_data}, self.priv, algorithm="EdDSA") # cast(Any, self.priv) @@ -392,14 +410,23 @@ class AuthKeys: return token def generate_pageserver_token(self) -> str: - return self.generate_token(scope="pageserverapi") + return self.generate_token(scope=TokenScope.PAGE_SERVER_API) def generate_safekeeper_token(self) -> str: - return self.generate_token(scope="safekeeperdata") + return self.generate_token(scope=TokenScope.SAFEKEEPER_DATA) # generate token giving access to only one tenant def generate_tenant_token(self, tenant_id: TenantId) -> str: - return self.generate_token(scope="tenant", tenant_id=str(tenant_id)) + return self.generate_token(scope=TokenScope.TENANT, tenant_id=str(tenant_id)) + + +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class TokenScope(str, Enum): + ADMIN = "admin" + PAGE_SERVER_API = "pageserverapi" + GENERATIONS_API = "generations_api" + SAFEKEEPER_DATA = "safekeeperdata" + TENANT = "tenant" class NeonEnvBuilder: @@ -423,9 +450,12 @@ class NeonEnvBuilder: pg_distrib_dir: Path, pg_version: PgVersion, test_name: str, + top_output_dir: Path, test_output_dir: Path, + test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, - pageserver_config_override: Optional[str] = None, + # toml that will be decomposed into `--config-override` flags during `pageserver --init` + pageserver_config_override: Optional[str | Callable[[Dict[str, Any]], None]] = None, num_safekeepers: int = 1, num_pageservers: int = 1, # Use non-standard SK ids to check for various parsing bugs @@ -438,6 +468,9 @@ class NeonEnvBuilder: preserve_database_files: bool = False, 
initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, + pageserver_virtual_file_io_engine: Optional[str] = None, + pageserver_aux_file_policy: Optional[AuxFileStore] = None, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -461,6 +494,7 @@ class NeonEnvBuilder: self.env: Optional[NeonEnv] = None self.keep_remote_storage_contents: bool = True self.neon_binpath = neon_binpath + self.neon_local_binpath = neon_binpath self.pg_distrib_dir = pg_distrib_dir self.pg_version = pg_version self.preserve_database_files = preserve_database_files @@ -468,6 +502,39 @@ class NeonEnvBuilder: self.initial_timeline = initial_timeline or TimelineId.generate() self.scrub_on_exit = False self.test_output_dir = test_output_dir + self.test_overlay_dir = test_overlay_dir + self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = [] + self.config_init_force: Optional[str] = None + self.top_output_dir = top_output_dir + self.control_plane_compute_hook_api: Optional[str] = None + self.storage_controller_config: Optional[dict[Any, Any]] = None + + self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine + + self.pageserver_default_tenant_config_compaction_algorithm: Optional[ + Dict[str, Any] + ] = pageserver_default_tenant_config_compaction_algorithm + if self.pageserver_default_tenant_config_compaction_algorithm is not None: + log.debug( + f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" + ) + + self.pageserver_get_vectored_impl: Optional[str] = None + if os.getenv("PAGESERVER_GET_VECTORED_IMPL", "") == "vectored": + self.pageserver_get_vectored_impl = "vectored" + log.debug('Overriding pageserver get_vectored_impl config to "vectored"') + + self.pageserver_get_impl: Optional[str] = None + if os.getenv("PAGESERVER_GET_IMPL", "") == 
"vectored": + self.pageserver_get_impl = "vectored" + log.debug('Overriding pageserver get_impl config to "vectored"') + + self.pageserver_validate_vectored_get: Optional[bool] = None + if (validate := os.getenv("PAGESERVER_VALIDATE_VEC_GET")) is not None: + self.pageserver_validate_vectored_get = bool(validate) + log.debug(f'Overriding pageserver validate_vectored_get config to "{validate}"') + + self.pageserver_aux_file_policy = pageserver_aux_file_policy assert test_name.startswith( "test_" @@ -488,8 +555,10 @@ class NeonEnvBuilder: def init_start( self, - initial_tenant_conf: Optional[Dict[str, str]] = None, + initial_tenant_conf: Optional[Dict[str, Any]] = None, default_remote_storage_if_missing: bool = True, + initial_tenant_shard_count: Optional[int] = None, + initial_tenant_shard_stripe_size: Optional[int] = None, ) -> NeonEnv: """ Default way to create and start NeonEnv. Also creates the initial_tenant with root initial_timeline. @@ -507,7 +576,12 @@ class NeonEnvBuilder: f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" ) initial_tenant, initial_timeline = env.neon_cli.create_tenant( - tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline + tenant_id=env.initial_tenant, + conf=initial_tenant_conf, + timeline_id=env.initial_timeline, + shard_count=initial_tenant_shard_count, + shard_stripe_size=initial_tenant_shard_stripe_size, + aux_file_v2=self.pageserver_aux_file_policy, ) assert env.initial_tenant == initial_tenant assert env.initial_timeline == initial_timeline @@ -515,20 +589,72 @@ class NeonEnvBuilder: return env + def build_and_use_snapshot( + self, global_ident: str, create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv] + ) -> NeonEnv: + if os.getenv("CI", "false") == "true": + log.info("do not use snapshots in ephemeral CI environment") + env = create_env_for_snapshot(self) + env.stop(immediate=True, ps_assert_metric_no_errors=False) + return env + + with 
shared_snapshot_dir(self.top_output_dir, global_ident) as snapshot_dir: + if not snapshot_dir.is_initialized(): + self._build_and_use_snapshot_impl(snapshot_dir, create_env_for_snapshot) + assert snapshot_dir.is_initialized() + + return self.from_repo_dir(snapshot_dir.path) + + def _build_and_use_snapshot_impl( + self, + snapshot_dir: SnapshotDirLocked, + create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv], + ): + if snapshot_dir.path.exists(): + shutil.rmtree(snapshot_dir.path) + + if self.test_overlay_dir is not None: + # Make repo_dir an overlayfs mount with lowerdir being the empty snapshot_dir. + # When we're done filling up repo_dir, tear everything down, unmount the overlayfs, and use + # the upperdir as the snapshot. This is equivalent to docker `FROM scratch`. + assert not self.repo_dir.exists() + assert self.repo_dir.parent.exists() + snapshot_dir.path.mkdir() + self.overlay_mount("create-snapshot-repo-dir", snapshot_dir.path, self.repo_dir) + self.config_init_force = "empty-dir-ok" + + env = create_env_for_snapshot(self) + assert self.env is not None + assert self.env == env + + # shut down everything for snapshot + env.stop(immediate=True, ps_assert_metric_no_errors=True) + + # TODO: all kinds of assertions to ensure the env is unused + + if self.test_overlay_dir is None: + log.info("take snapshot by moving repo dir") + env.repo_dir.rename(snapshot_dir.path) + else: + log.info("take snapshot by using overlayfs upperdir") + self.overlay_unmount_and_move("create-snapshot-repo-dir", snapshot_dir.path) + log.info("remove empty repo_dir (previously mountpoint) for snapshot overlay_mount") + env.repo_dir.rmdir() + # TODO from here on, we should be able to reset / goto top where snapshot_dir.is_initialized() + log.info("make repo_dir an overlayfs mount of the snapshot we just created") + assert not env.repo_dir.exists(), "both branches above should remove it" + snapshot_dir.set_initialized() + + self.env = None # so that from_repo_dir works again + 
def from_repo_dir( self, repo_dir: Path, - neon_binpath: Optional[Path] = None, - pg_distrib_dir: Optional[Path] = None, ) -> NeonEnv: """ A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir. """ - # Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests - self.neon_binpath = neon_binpath or self.neon_binpath - self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir - # Get the initial tenant and timeline from the snapshot config snapshot_config_toml = repo_dir / "config" with snapshot_config_toml.open("r") as f: @@ -546,8 +672,16 @@ class NeonEnvBuilder: tenants_from_dir = ps_dir / "tenants" tenants_to_dir = self.repo_dir / ps_dir.name / "tenants" - log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}") - shutil.copytree(tenants_from_dir, tenants_to_dir) + if self.test_overlay_dir is None: + log.info( + f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}" + ) + shutil.copytree(tenants_from_dir, tenants_to_dir) + else: + log.info( + f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}" + ) + self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir) for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"): sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name @@ -556,9 +690,18 @@ class NeonEnvBuilder: shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid")) shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) - shutil.copytree( - repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" - ) + if self.test_overlay_dir is None: + log.info("Copying local_fs_remote_storage directory from snapshot") + shutil.copytree( + repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" + ) + else: + log.info("Creating overlayfs mount of local_fs_remote_storage directory from 
snapshot") + self.overlay_mount( + "local_fs_remote_storage", + repo_dir / "local_fs_remote_storage", + self.repo_dir / "local_fs_remote_storage", + ) if (attachments_json := Path(repo_dir / "attachments.json")).exists(): shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) @@ -570,15 +713,127 @@ class NeonEnvBuilder: config["default_tenant_id"] = snapshot_config["default_tenant_id"] config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] + # Update the config with new neon + postgres path in case of compat test + config["pg_distrib_dir"] = str(self.pg_distrib_dir) + config["neon_distrib_dir"] = str(self.neon_binpath) + with (self.repo_dir / "config").open("w") as f: toml.dump(config, f) return self.env + def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): + """ + Mount `srcdir` as an overlayfs mount at `dstdir`. + The overlayfs `upperdir` and `workdir` will be placed in test_overlay_dir. + """ + assert self.test_overlay_dir + assert ( + self.test_output_dir in dstdir.parents + ) # so that teardown & test_overlay_dir fixture work + assert srcdir.is_dir() + dstdir.mkdir(exist_ok=False, parents=False) + ident_state_dir = self.test_overlay_dir / ident + upper = ident_state_dir / "upper" + work = ident_state_dir / "work" + ident_state_dir.mkdir( + exist_ok=False, parents=False + ) # exists_ok=False also checks uniqueness in self.overlay_mounts + upper.mkdir() + work.mkdir() + cmd = [ + "sudo", + "mount", + "-t", + "overlay", + "overlay", + "-o", + f"lowerdir={srcdir},upperdir={upper},workdir={work}", + str(dstdir), + ] + log.info(f"Mounting overlayfs srcdir={srcdir} dstdir={dstdir}: {cmd}") + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + self.overlay_mounts_created_by_us.append((ident, dstdir)) + + def _overlay_umount(self, mountpoint: Path): + cmd = ["sudo", "umount", str(mountpoint)] + assert mountpoint.is_mount() + subprocess_capture( + self.test_output_dir, 
cmd, check=True, echo_stderr=True, echo_stdout=True + ) + + def overlay_unmount_and_move(self, ident: str, dst: Path): + """ + Unmount previously established overlayfs mount at `dstdir` and move the upperdir contents to `dst`. + If `dst` is an empty directory, it gets replaced. + Caller is responsible for ensuring the unmount will succeed, i.e., that there aren't any nested mounts. + + Raises exception if self.test_overlay_dir is None + """ + assert self.test_overlay_dir is not None + # not mutating state yet, make checks + ident_state_dir = self.test_overlay_dir / ident + assert ident_state_dir.is_dir() + upper = ident_state_dir / "upper" + work = ident_state_dir / "work" + assert upper.is_dir() + assert work.is_dir() + assert ( + self.test_overlay_dir not in dst.parents + ), "otherwise workdir cleanup below wouldn't work" + # find index, still not mutating state + idxmap = { + existing_ident: idx + for idx, (existing_ident, _) in enumerate(self.overlay_mounts_created_by_us) + } + idx = idxmap.get(ident) + if idx is None: + raise RuntimeError(f"cannot find mount for ident {ident}") + + if dst.is_dir(): + dst.rmdir() # raises exception if not empty, which is what we want + + _, mountpoint = self.overlay_mounts_created_by_us.pop(idx) + self._overlay_umount(mountpoint) + upper.rename(dst) + # we moved the upperdir, clean up workdir and then its parent ident_state_dir + cmd = ["sudo", "rm", "-rf", str(work)] + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + ident_state_dir.rmdir() # should be empty since we moved `upper` out + + def overlay_cleanup_teardown(self): + """ + Unmount the overlayfs mounts created by `self.overlay_mount()`. + Supposed to be called during env teardown. 
+ """ + if self.test_overlay_dir is None: + return + while len(self.overlay_mounts_created_by_us) > 0: + (ident, mountpoint) = self.overlay_mounts_created_by_us.pop() + ident_state_dir = self.test_overlay_dir / ident + log.info( + f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}" + ) + self._overlay_umount(mountpoint) + log.info( + f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}" + ) + cmd = ["sudo", "rm", "-rf", str(ident_state_dir)] + subprocess_capture( + self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True + ) + + # assert all overlayfs mounts in our test directory are gone + assert [] == list(overlayfs.iter_mounts_beneath(self.test_overlay_dir)) + def enable_scrub_on_exit(self): """ Call this if you would like the fixture to automatically run - s3_scrubber at the end of the test, as a bidirectional test + storage_scrubber at the end of the test, as a bidirectional test that the scrubber is working properly, and that the code within the test didn't produce any invalid remote state. """ @@ -641,8 +896,15 @@ class NeonEnvBuilder: if self.preserve_database_files: return + overlayfs_mounts = {mountpoint for _, mountpoint in self.overlay_mounts_created_by_us} + directories_to_clean: List[Path] = [] for test_entry in Path(self.repo_dir).glob("**/*"): + if test_entry in overlayfs_mounts: + continue + for parent in test_entry.parents: + if parent in overlayfs_mounts: + continue if test_entry.is_file(): test_file = test_entry if ATTACHMENT_NAME_REGEX.fullmatch(test_file.name): @@ -676,22 +938,17 @@ class NeonEnvBuilder: # Stop all the nodes. 
if self.env: log.info("Cleaning up all storage and compute nodes") - self.env.endpoints.stop_all() - for sk in self.env.safekeepers: - sk.stop(immediate=True) - - for pageserver in self.env.pageservers: - pageserver.assert_no_metric_errors() - - pageserver.stop(immediate=True) - - self.env.attachment_service.stop(immediate=True) - + self.env.stop( + immediate=True, + # if the test threw an exception, don't check for errors + # as a failing assertion would cause the cleanup below to fail + ps_assert_metric_no_errors=(exc_type is None), + ) cleanup_error = None if self.scrub_on_exit: try: - S3Scrubber(self.test_output_dir, self).scan_metadata() + StorageScrubber(self).scan_metadata() except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -716,6 +973,18 @@ class NeonEnvBuilder: for pageserver in self.env.pageservers: pageserver.assert_no_errors() + for safekeeper in self.env.safekeepers: + safekeeper.assert_no_errors() + + self.env.storage_controller.assert_no_errors() + + try: + self.overlay_cleanup_teardown() + except Exception as e: + log.error(f"Error cleaning up overlay state: {e}") + if cleanup_error is not None: + cleanup_error = e + class NeonEnv: """ @@ -732,7 +1001,7 @@ class NeonEnv: Some notable functions and fields in NeonEnv: - postgres - A factory object for creating postgres compute nodes. + endpoints - A factory object for creating postgres compute nodes. 
pageservers - An array containing objects representing the pageservers @@ -767,35 +1036,62 @@ class NeonEnv: self.pg_version = config.pg_version # Binary path for pageserver, safekeeper, etc self.neon_binpath = config.neon_binpath - # Binary path for neon_local test-specific binaries: may be overridden - # after construction for compat testing - self.neon_local_binpath = config.neon_binpath + # Binary path for neon_local test-specific binaries + self.neon_local_binpath = config.neon_local_binpath + if self.neon_local_binpath is None: + self.neon_local_binpath = self.neon_binpath self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 - self.pageserver_config_override = config.pageserver_config_override + self.storage_controller_config = config.storage_controller_config # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - attachment_service_port = self.port_distributor.get_port() - self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}" - self.attachment_service: NeonAttachmentService = NeonAttachmentService(self) + # Find two adjacent ports for storage controller and its postgres DB. This + # loop would eventually throw from get_port() if we run out of ports (extremely + # unlikely): usually we find two adjacent free ports on the first iteration. 
+ while True: + self.storage_controller_port = self.port_distributor.get_port() + storage_controller_pg_port = self.port_distributor.get_port() + if storage_controller_pg_port == self.storage_controller_port + 1: + break - # Create a config file corresponding to the options + # The URL for the pageserver to use as its control_plane_api config + self.control_plane_api: str = f"http://127.0.0.1:{self.storage_controller_port}/upcall/v1" + # The base URL of the storage controller + self.storage_controller_api: str = f"http://127.0.0.1:{self.storage_controller_port}" + + # For testing this with a fake HTTP server, enable passing through a URL from config + self.control_plane_compute_hook_api = config.control_plane_compute_hook_api + + self.storage_controller: NeonStorageController = NeonStorageController( + self, config.auth_enabled + ) + + self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine + self.pageserver_aux_file_policy = config.pageserver_aux_file_policy + + # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { "default_tenant_id": str(self.initial_tenant), "broker": { "listen_addr": self.broker.listen_addr(), }, - "pageservers": [], "safekeepers": [], + "pageservers": [], } if self.control_plane_api is not None: cfg["control_plane_api"] = self.control_plane_api + if self.control_plane_compute_hook_api is not None: + cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + + if self.storage_controller_config is not None: + cfg["storage_controller"] = self.storage_controller_config + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -814,13 +1110,41 @@ class NeonEnv: "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, } + if self.pageserver_virtual_file_io_engine is not None: + ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine + if 
config.pageserver_get_vectored_impl is not None: + ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl + if config.pageserver_get_impl is not None: + ps_cfg["get_impl"] = config.pageserver_get_impl + if config.pageserver_validate_vectored_get is not None: + ps_cfg["validate_vectored_get"] = config.pageserver_validate_vectored_get + if config.pageserver_default_tenant_config_compaction_algorithm is not None: + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config[ + "compaction_algorithm" + ] = config.pageserver_default_tenant_config_compaction_algorithm + + if self.pageserver_remote_storage is not None: + ps_cfg["remote_storage"] = remote_storage_to_toml_dict( + self.pageserver_remote_storage + ) + + if config.pageserver_config_override is not None: + if callable(config.pageserver_config_override): + config.pageserver_config_override(ps_cfg) + else: + assert isinstance(config.pageserver_config_override, str) + for o in config.pageserver_config_override.split(";"): + override = toml.loads(o) + for key, value in override.items(): + ps_cfg[key] = value + # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( self, ps_id, port=pageserver_port, - config_override=self.pageserver_config_override, ) ) cfg["pageservers"].append(ps_cfg) @@ -848,19 +1172,55 @@ class NeonEnv: cfg["safekeepers"].append(sk_cfg) log.info(f"Config: {cfg}") - self.neon_cli.init(cfg) + self.neon_cli.init( + cfg, + force=config.config_init_force, + ) def start(self): + # Storage controller starts first, so that pageserver /re-attach calls don't + # bounce through retries on startup + self.storage_controller.start() + + # Wait for storage controller readiness to prevent unnecessary post start-up + # reconcile. 
+ self.storage_controller.wait_until_ready() + # Start up broker, pageserver and all safekeepers - self.broker.try_start() + futs = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=2 + len(self.pageservers) + len(self.safekeepers) + ) as executor: + futs.append( + executor.submit(lambda: self.broker.try_start() or None) + ) # The `or None` is for the linter - self.attachment_service.start() + for pageserver in self.pageservers: + futs.append(executor.submit(lambda ps=pageserver: ps.start())) + for safekeeper in self.safekeepers: + futs.append(executor.submit(lambda sk=safekeeper: sk.start())) + + for f in futs: + f.result() + + def stop(self, immediate=False, ps_assert_metric_no_errors=False): + """ + After this method returns, there should be no child processes running. + """ + self.endpoints.stop_all() + + # Stop storage controller before pageservers: we don't want it to spuriously + # detect a pageserver "failure" during test teardown + self.storage_controller.stop(immediate=immediate) + + for sk in self.safekeepers: + sk.stop(immediate=immediate) for pageserver in self.pageservers: - pageserver.start() - - for safekeeper in self.safekeepers: - safekeeper.start() + if ps_assert_metric_no_errors: + pageserver.assert_no_metric_errors() + pageserver.stop(immediate=immediate) + self.broker.stop(immediate=immediate) @property def pageserver(self) -> NeonPageserver: @@ -869,7 +1229,9 @@ class NeonEnv: assert that there is only one. Tests with multiple pageservers should always use get_pageserver with an explicit ID. 
""" - assert len(self.pageservers) == 1 + assert ( + len(self.pageservers) == 1 + ), "env.pageserver must only be used with single pageserver NeonEnv" return self.pageservers[0] def get_pageserver(self, id: Optional[int]) -> NeonPageserver: @@ -889,6 +1251,17 @@ class NeonEnv: raise RuntimeError(f"Pageserver with ID {id} not found") + def get_tenant_pageserver(self, tenant_id: Union[TenantId, TenantShardId]): + """ + Get the NeonPageserver where this tenant shard is currently attached, according + to the storage controller. + """ + meta = self.storage_controller.inspect(tenant_id) + if meta is None: + return None + pageserver_id = meta[1] + return self.get_pageserver(pageserver_id) + def get_safekeeper_connstrs(self) -> str: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers) @@ -953,6 +1326,9 @@ def _shared_simple_env( neon_binpath: Path, pg_distrib_dir: Path, pg_version: PgVersion, + pageserver_virtual_file_io_engine: str, + pageserver_aux_file_policy: Optional[AuxFileStore], + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. 
If TEST_SHARED_FIXTURES @@ -970,6 +1346,7 @@ def _shared_simple_env( shutil.rmtree(repo_dir, ignore_errors=True) with NeonEnvBuilder( + top_output_dir=top_output_dir, repo_dir=repo_dir, port_distributor=port_distributor, broker=default_broker, @@ -981,6 +1358,9 @@ def _shared_simple_env( preserve_database_files=pytestconfig.getoption("--preserve-database-files"), test_name=request.node.name, test_output_dir=test_output_dir, + pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, + pageserver_aux_file_policy=pageserver_aux_file_policy, + pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: env = builder.init_start() @@ -1017,6 +1397,11 @@ def neon_env_builder( default_broker: NeonBroker, run_id: uuid.UUID, request: FixtureRequest, + test_overlay_dir: Path, + top_output_dir: Path, + pageserver_virtual_file_io_engine: str, + pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], + pageserver_aux_file_policy: Optional[AuxFileStore], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. 
@@ -1036,6 +1421,7 @@ def neon_env_builder( # Return the builder to the caller with NeonEnvBuilder( + top_output_dir=top_output_dir, repo_dir=Path(repo_dir), port_distributor=port_distributor, mock_s3_server=mock_s3_server, @@ -1045,8 +1431,12 @@ def neon_env_builder( broker=default_broker, run_id=run_id, preserve_database_files=pytestconfig.getoption("--preserve-database-files"), + pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, test_name=request.node.name, test_output_dir=test_output_dir, + test_overlay_dir=test_overlay_dir, + pageserver_aux_file_policy=pageserver_aux_file_policy, + pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, ) as builder: yield builder @@ -1113,7 +1503,6 @@ class AbstractNeonCli(abc.ABC): args = [bin_neon] + arguments log.info('Running command "{}"'.format(" ".join(args))) - log.info(f'Running in "{self.env.repo_dir}"') env_vars = os.environ.copy() env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) @@ -1130,15 +1519,29 @@ class AbstractNeonCli(abc.ABC): env_vars[var] = val # Intercept CalledProcessError and print more info - res = subprocess.run( - args, - env=env_vars, - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - timeout=timeout, - ) + try: + res = subprocess.run( + args, + env=env_vars, + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout, + ) + except subprocess.TimeoutExpired as e: + if e.stderr: + stderr = e.stderr.decode(errors="replace") + else: + stderr = "" + + if e.stdout: + stdout = e.stdout.decode(errors="replace") + else: + stdout = "" + + log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}") + raise indent = " " if not res.returncode: @@ -1188,8 +1591,12 @@ class NeonCli(AbstractNeonCli): self, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None, - conf: Optional[Dict[str, str]] = None, + conf: 
Optional[Dict[str, Any]] = None, + shard_count: Optional[int] = None, + shard_stripe_size: Optional[int] = None, + placement_policy: Optional[str] = None, set_default: bool = False, + aux_file_v2: Optional[AuxFileStore] = None, ) -> Tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. @@ -1213,13 +1620,37 @@ class NeonCli(AbstractNeonCli): product(["-c"], (f"{key}:{value}" for key, value in conf.items())) ) ) + + if aux_file_v2 is AuxFileStore.V2: + args.extend(["-c", "switch_aux_file_policy:v2"]) + + if aux_file_v2 is AuxFileStore.V1: + args.extend(["-c", "switch_aux_file_policy:v1"]) + + if aux_file_v2 is AuxFileStore.CrossValidation: + args.extend(["-c", "switch_aux_file_policy:cross-validation"]) + if set_default: args.append("--set-default") + if shard_count is not None: + args.extend(["--shard-count", str(shard_count)]) + + if shard_stripe_size is not None: + args.extend(["--shard-stripe-size", str(shard_stripe_size)]) + + if placement_policy is not None: + args.extend(["--placement-policy", str(placement_policy)]) + res = self.raw_cli(args) res.check_returncode() return tenant_id, timeline_id + def import_tenant(self, tenant_id: TenantId): + args = ["tenant", "import", "--tenant-id", str(tenant_id)] + res = self.raw_cli(args) + res.check_returncode() + def set_default(self, tenant_id: TenantId): """ Update default tenant for future operations that require tenant_id. 
@@ -1333,35 +1764,31 @@ class NeonCli(AbstractNeonCli): def init( self, - config: Dict[str, Any], + init_config: Dict[str, Any], + force: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": - with tempfile.NamedTemporaryFile(mode="w+") as tmp: - tmp.write(toml.dumps(config)) - tmp.flush() + with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: + init_config_tmpfile.write(toml.dumps(init_config)) + init_config_tmpfile.flush() - cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version] + cmd = [ + "init", + f"--config={init_config_tmpfile.name}", + ] - storage = self.env.pageserver_remote_storage + if force is not None: + cmd.extend(["--force", force]) - append_pageserver_param_overrides( - params_to_update=cmd, - remote_storage=storage, - pageserver_config_override=self.env.pageserver_config_override, - ) - - s3_env_vars = None - if isinstance(storage, S3Storage): - s3_env_vars = storage.access_env_vars() - res = self.raw_cli(cmd, extra_env_vars=s3_env_vars) + res = self.raw_cli(cmd) res.check_returncode() - return res + return res - def attachment_service_start(self): - cmd = ["attachment_service", "start"] + def storage_controller_start(self): + cmd = ["storage_controller", "start"] return self.raw_cli(cmd) - def attachment_service_stop(self, immediate: bool): - cmd = ["attachment_service", "stop"] + def storage_controller_stop(self, immediate: bool): + cmd = ["storage_controller", "stop"] if immediate: cmd.extend(["-m", "immediate"]) return self.raw_cli(cmd) @@ -1369,16 +1796,10 @@ class NeonCli(AbstractNeonCli): def pageserver_start( self, id: int, - overrides: Tuple[str, ...] 
= (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "subprocess.CompletedProcess[str]": - start_args = ["pageserver", "start", f"--id={id}", *overrides] + start_args = ["pageserver", "start", f"--id={id}"] storage = self.env.pageserver_remote_storage - append_pageserver_param_overrides( - params_to_update=start_args, - remote_storage=storage, - pageserver_config_override=self.env.pageserver_config_override, - ) if isinstance(storage, S3Storage): s3_env_vars = storage.access_env_vars() @@ -1429,6 +1850,7 @@ class NeonCli(AbstractNeonCli): hot_standby: bool = False, lsn: Optional[Lsn] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1452,6 +1874,8 @@ class NeonCli(AbstractNeonCli): args.extend(["--hot-standby", "true"]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1463,6 +1887,7 @@ class NeonCli(AbstractNeonCli): safekeepers: Optional[List[int]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1477,6 +1902,8 @@ class NeonCli(AbstractNeonCli): args.append(endpoint_id) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) res = self.raw_cli(args) res.check_returncode() @@ -1501,6 +1928,7 @@ class NeonCli(AbstractNeonCli): endpoint_id: str, destroy=False, check_return_code=True, + mode: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1508,6 +1936,8 @@ class NeonCli(AbstractNeonCli): ] if destroy: args.append("--destroy") + if mode is not None: + args.append(f"--mode={mode}") if endpoint_id is not None: args.append(endpoint_id) @@ -1583,46 +2013,189 @@ class Pagectl(AbstractNeonCli): 
return IndexPartDump.from_json(parsed) -class NeonAttachmentService: - def __init__(self, env: NeonEnv): +class LogUtils: + """ + A mixin class which provides utilities for inspecting the logs of a service. + """ + + def __init__(self, logfile: Path) -> None: + self.logfile = logfile + + def assert_log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Tuple[str, LogCursor]: + """Convenient for use inside wait_until()""" + + res = self.log_contains(pattern, offset=offset) + assert res is not None + return res + + def log_contains( + self, pattern: str, offset: None | LogCursor = None + ) -> Optional[Tuple[str, LogCursor]]: + """Check that the log contains a line that matches the given regex""" + logfile = self.logfile + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None + + contains_re = re.compile(pattern) + + # XXX: Our rust logging machinery buffers the messages, so if you + # call this function immediately after it's been logged, there is + # no guarantee it is already present in the log file. This hasn't + # been a problem in practice, our python tests are not fast enough + # to hit that race condition. + skip_until_line_no = 0 if offset is None else offset._line_no + cur_line_no = 0 + with logfile.open("r") as f: + for line in f: + if cur_line_no < skip_until_line_no: + cur_line_no += 1 + continue + elif contains_re.search(line): + # found it! 
+ cur_line_no += 1 + return (line, LogCursor(cur_line_no)) + else: + cur_line_no += 1 + return None + + +class StorageControllerApiException(Exception): + def __init__(self, message, status_code: int): + super().__init__(message) + self.message = message + self.status_code = status_code + + +class NeonStorageController(MetricsGetter, LogUtils): + def __init__(self, env: NeonEnv, auth_enabled: bool): self.env = env self.running = False + self.auth_enabled = auth_enabled + self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS + self.logfile = self.workdir / "storage_controller.log" def start(self): assert not self.running - self.env.neon_cli.attachment_service_start() + self.env.neon_cli.storage_controller_start() self.running = True return self - def stop(self, immediate: bool = False) -> "NeonAttachmentService": + def stop(self, immediate: bool = False) -> "NeonStorageController": if self.running: - self.env.neon_cli.attachment_service_stop(immediate) + self.env.neon_cli.storage_controller_stop(immediate) self.running = False return self - def attach_hook_issue(self, tenant_id: TenantId, pageserver_id: int) -> int: - response = requests.post( - f"{self.env.control_plane_api}/attach-hook", - json={"tenant_id": str(tenant_id), "node_id": pageserver_id}, + @staticmethod + def raise_api_exception(res: requests.Response): + try: + res.raise_for_status() + except requests.RequestException as e: + try: + msg = res.json()["msg"] + except: # noqa: E722 + msg = "" + raise StorageControllerApiException(msg, res.status_code) from e + + def assert_no_errors(self): + assert_no_errors( + self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors + ) + + def pageserver_api(self) -> PageserverHttpClient: + """ + The storage controller implements a subset of the pageserver REST API, for mapping + per-tenant actions into per-shard actions (e.g. timeline creation). 
Tests should invoke those + functions via the HttpClient, as an implicit check that these APIs remain compatible. + """ + auth_token = None + if self.auth_enabled: + auth_token = self.env.auth_keys.generate_token(scope=TokenScope.PAGE_SERVER_API) + return PageserverHttpClient(self.env.storage_controller_port, lambda: True, auth_token) + + def request(self, method, *args, **kwargs) -> requests.Response: + resp = requests.request(method, *args, **kwargs) + NeonStorageController.raise_api_exception(resp) + + return resp + + def headers(self, scope: Optional[TokenScope]) -> Dict[str, str]: + headers = {} + if self.auth_enabled and scope is not None: + jwt_token = self.env.auth_keys.generate_token(scope=scope) + headers["Authorization"] = f"Bearer {jwt_token}" + + return headers + + def get_metrics(self) -> Metrics: + res = self.request("GET", f"{self.env.storage_controller_api}/metrics") + return parse_metrics(res.text) + + def ready(self) -> bool: + status = None + try: + resp = self.request("GET", f"{self.env.storage_controller_api}/ready") + status = resp.status_code + except StorageControllerApiException as e: + status = e.status_code + + if status == 503: + return False + elif status == 200: + return True + else: + raise RuntimeError(f"Unexpected status {status} from readiness endpoint") + + def wait_until_ready(self): + t1 = time.time() + + def storage_controller_ready(): + assert self.ready() is True + + wait_until(30, 1, storage_controller_ready) + return time.time() - t1 + + def attach_hook_issue( + self, + tenant_shard_id: Union[TenantId, TenantShardId], + pageserver_id: int, + generation_override: Optional[int] = None, + ) -> int: + body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} + if generation_override is not None: + body["generation_override"] = generation_override + + response = self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/attach-hook", + json=body, + headers=self.headers(TokenScope.ADMIN), ) - 
response.raise_for_status() gen = response.json()["gen"] assert isinstance(gen, int) return gen - def attach_hook_drop(self, tenant_id: TenantId): - response = requests.post( - f"{self.env.control_plane_api}/attach-hook", - json={"tenant_id": str(tenant_id), "node_id": None}, + def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]): + self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/attach-hook", + json={"tenant_shard_id": str(tenant_shard_id), "node_id": None}, + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() - def inspect(self, tenant_id: TenantId) -> Optional[tuple[int, int]]: - response = requests.post( - f"{self.env.control_plane_api}/inspect", - json={"tenant_id": str(tenant_id)}, + def inspect(self, tenant_shard_id: Union[TenantId, TenantShardId]) -> Optional[tuple[int, int]]: + """ + :return: 2-tuple of (generation, pageserver id), or None if unknown + """ + response = self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/inspect", + json={"tenant_shard_id": str(tenant_shard_id)}, + headers=self.headers(TokenScope.ADMIN), ) - response.raise_for_status() json = response.json() log.info(f"Response: {json}") if json["attachment"]: @@ -1631,7 +2204,237 @@ class NeonAttachmentService: else: return None - def __enter__(self) -> "NeonAttachmentService": + def node_register(self, node: NeonPageserver): + body = { + "node_id": int(node.id), + "listen_http_addr": "localhost", + "listen_http_port": node.service_port.http, + "listen_pg_addr": "localhost", + "listen_pg_port": node.service_port.pg, + } + log.info(f"node_register({body})") + self.request( + "POST", + f"{self.env.storage_controller_api}/control/v1/node", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + + def node_drain(self, node_id): + log.info(f"node_drain({node_id})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain", + headers=self.headers(TokenScope.ADMIN), + ) 
+ + def node_fill(self, node_id): + log.info(f"node_fill({node_id})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill", + headers=self.headers(TokenScope.ADMIN), + ) + + def node_status(self, node_id): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def node_list(self): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/node", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def tenant_list(self): + response = self.request( + "GET", + f"{self.env.storage_controller_api}/debug/v1/tenant", + headers=self.headers(TokenScope.ADMIN), + ) + return response.json() + + def node_configure(self, node_id, body: dict[str, Any]): + log.info(f"node_configure({node_id}, {body})") + body["node_id"] = node_id + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/node/{node_id}/config", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + + def tenant_create( + self, + tenant_id: TenantId, + shard_count: Optional[int] = None, + shard_stripe_size: Optional[int] = None, + tenant_config: Optional[Dict[Any, Any]] = None, + placement_policy: Optional[Union[Dict[Any, Any] | str]] = None, + ): + """ + Use this rather than pageserver_api() when you need to include shard parameters + """ + body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} + + if shard_count is not None: + shard_params = {"count": shard_count} + if shard_stripe_size is not None: + shard_params["stripe_size"] = shard_stripe_size + else: + shard_params["stripe_size"] = 32768 + + body["shard_parameters"] = shard_params + + if tenant_config is not None: + for k, v in tenant_config.items(): + body[k] = v + + body["placement_policy"] = placement_policy + + response = self.request( + "POST", + f"{self.env.storage_controller_api}/v1/tenant", + json=body, + 
headers=self.headers(TokenScope.PAGE_SERVER_API), + ) + response.raise_for_status() + log.info(f"tenant_create success: {response.json()}") + + def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: + """ + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + """ + response = self.request( + "GET", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/locate", + headers=self.headers(TokenScope.ADMIN), + ) + body = response.json() + shards: list[dict[str, Any]] = body["shards"] + return shards + + def tenant_describe(self, tenant_id: TenantId): + """ + :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr: str, "listen_http_port: int} + """ + response = self.request( + "GET", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}", + headers=self.headers(TokenScope.ADMIN), + ) + response.raise_for_status() + return response.json() + + def tenant_shard_split( + self, tenant_id: TenantId, shard_count: int, shard_stripe_size: Optional[int] = None + ) -> list[TenantShardId]: + response = self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/shard_split", + json={"new_shard_count": shard_count, "new_stripe_size": shard_stripe_size}, + headers=self.headers(TokenScope.ADMIN), + ) + body = response.json() + log.info(f"tenant_shard_split success: {body}") + shards: list[TenantShardId] = body["new_shards"] + return shards + + def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int): + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_shard_id}/migrate", + json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id}, + headers=self.headers(TokenScope.ADMIN), + ) + log.info(f"Migrated tenant {tenant_shard_id} to pageserver {dest_ps_id}") + assert 
self.env.get_tenant_pageserver(tenant_shard_id).id == dest_ps_id + + def tenant_policy_update(self, tenant_id: TenantId, body: dict[str, Any]): + log.info(f"tenant_policy_update({tenant_id}, {body})") + self.request( + "PUT", + f"{self.env.storage_controller_api}/control/v1/tenant/{tenant_id}/policy", + json=body, + headers=self.headers(TokenScope.ADMIN), + ) + + def tenant_import(self, tenant_id: TenantId): + self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/tenant/{tenant_id}/import", + headers=self.headers(TokenScope.ADMIN), + ) + + def reconcile_all(self): + r = self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/reconcile_all", + headers=self.headers(TokenScope.ADMIN), + ) + r.raise_for_status() + n = r.json() + log.info(f"reconcile_all waited for {n} shards") + return n + + def reconcile_until_idle(self, timeout_secs=30): + start_at = time.time() + n = 1 + delay_sec = 0.5 + delay_max = 5 + while n > 0: + n = self.reconcile_all() + if n == 0: + break + elif time.time() - start_at > timeout_secs: + raise RuntimeError("Timeout in reconcile_until_idle") + else: + # Don't call again right away: if we're waiting for many reconciles that + # are blocked on the concurrency limit, it slows things down to call + # reconcile_all frequently. 
+ time.sleep(delay_sec) + delay_sec *= 2 + delay_sec = min(delay_sec, delay_max) + + def consistency_check(self): + """ + Throw an exception if the service finds any inconsistencies in its state + """ + self.request( + "POST", + f"{self.env.storage_controller_api}/debug/v1/consistency_check", + headers=self.headers(TokenScope.ADMIN), + ) + log.info("storage controller passed consistency check") + + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.request( + "PUT", + f"{self.env.storage_controller_api}/debug/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + headers=self.headers(TokenScope.ADMIN), + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + + @property + def workdir(self) -> Path: + return self.env.repo_dir + + def __enter__(self) -> "NeonStorageController": return self def __exit__( @@ -1643,24 +2446,26 @@ class NeonAttachmentService: self.stop(immediate=True) -class NeonPageserver(PgProtocol): +@dataclass +class LogCursor: + _line_no: int + + +class NeonPageserver(PgProtocol, LogUtils): """ An object representing a running pageserver. """ TEMP_FILE_SUFFIX = "___temp" - def __init__( - self, env: NeonEnv, id: int, port: PageserverPort, config_override: Optional[str] = None - ): + def __init__(self, env: NeonEnv, id: int, port: PageserverPort): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.id = id self.running = False self.service_port = port - self.config_override = config_override self.version = env.get_binary_version("pageserver") - + self.logfile = self.workdir / "pageserver.log" # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. 
If your test expects an error, add it to # 'allowed_errors' in the test with something like: @@ -1670,24 +2475,61 @@ class NeonPageserver(PgProtocol): # The entries in the list are regular experessions. self.allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) - def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path: + def timeline_dir( + self, + tenant_shard_id: Union[TenantId, TenantShardId], + timeline_id: Optional[TimelineId] = None, + ) -> Path: """Get a timeline directory's path based on the repo directory of the test environment""" if timeline_id is None: - return self.tenant_dir(tenant_id) / "timelines" - return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id) + return self.tenant_dir(tenant_shard_id) / "timelines" + return self.tenant_dir(tenant_shard_id) / "timelines" / str(timeline_id) def tenant_dir( self, - tenant_id: Optional[TenantId] = None, + tenant_shard_id: Optional[Union[TenantId, TenantShardId]] = None, ) -> Path: """Get a tenant directory's path based on the repo directory of the test environment""" - if tenant_id is None: + if tenant_shard_id is None: return self.workdir / "tenants" - return self.workdir / "tenants" / str(tenant_id) + return self.workdir / "tenants" / str(tenant_shard_id) + + @property + def config_toml_path(self) -> Path: + return self.workdir / "pageserver.toml" + + def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]): + """ + Edit the pageserver's config toml file in place. + """ + path = self.config_toml_path + with open(path, "r") as f: + config = toml.load(f) + edit_fn(config) + with open(path, "w") as f: + toml.dump(config, f) + + def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]: + """ + Non-recursively merge the given `patch` dict into the existing config toml, using `dict.update()`. + Returns the replaced values. + If there was no previous value, the key is mapped to None. 
+ This allows to restore the original value by calling this method with the returned dict. + """ + replacements = {} + + def doit(config: Dict[str, Any]): + while len(patch) > 0: + key, new = patch.popitem() + old = config.get(key, None) + config[key] = new + replacements[key] = old + + self.edit_config_toml(doit) + return replacements def start( self, - overrides: Tuple[str, ...] = (), extra_env_vars: Optional[Dict[str, str]] = None, ) -> "NeonPageserver": """ @@ -1697,9 +2539,7 @@ class NeonPageserver(PgProtocol): """ assert self.running is False - self.env.neon_cli.pageserver_start( - self.id, overrides=overrides, extra_env_vars=extra_env_vars - ) + self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars) self.running = True return self @@ -1777,18 +2617,9 @@ class NeonPageserver(PgProtocol): return self.env.repo_dir / f"pageserver_{self.id}" def assert_no_errors(self): - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return - - with logfile.open("r") as f: - errors = scan_pageserver_log_for_errors(f, self.allowed_errors) - - for _lineno, error in errors: - log.info(f"not allowed error: {error.strip()}") - - assert not errors + assert_no_errors( + self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors + ) def assert_no_metric_errors(self): """ @@ -1805,58 +2636,49 @@ class NeonPageserver(PgProtocol): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" - def log_contains(self, pattern: str) -> Optional[str]: - """Check that the pageserver log contains a line that matches the given regex""" - logfile = self.workdir / "pageserver.log" - if not logfile.exists(): - log.warning(f"Skipping log check: {logfile} does not exist") - return None - - contains_re = re.compile(pattern) - - # XXX: Our rust logging machinery buffers the messages, so if you - # call this function immediately after it's 
been logged, there is - # no guarantee it is already present in the log file. This hasn't - # been a problem in practice, our python tests are not fast enough - # to hit that race condition. - with logfile.open("r") as f: - for line in f: - if contains_re.search(line): - # found it! - return line - - return None - def tenant_attach( - self, tenant_id: TenantId, config: None | Dict[str, Any] = None, config_null: bool = False + self, + tenant_id: TenantId, + config: None | Dict[str, Any] = None, + config_null: bool = False, + generation: Optional[int] = None, + override_storage_controller_generation: bool = False, ): """ Tenant attachment passes through here to acquire a generation number before proceeding to call into the pageserver HTTP client. """ client = self.http_client() + if generation is None: + generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) + elif override_storage_controller_generation: + generation = self.env.storage_controller.attach_hook_issue( + tenant_id, self.id, generation + ) return client.tenant_attach( tenant_id, config, config_null, - generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id), + generation=generation, ) def tenant_detach(self, tenant_id: TenantId): - self.env.attachment_service.attach_hook_drop(tenant_id) + self.env.storage_controller.attach_hook_drop(tenant_id) client = self.http_client() return client.tenant_detach(tenant_id) def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): if config["mode"].startswith("Attached") and "generation" not in config: - config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + config["generation"] = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client() return client.tenant_location_conf(tenant_id, config, **kwargs) - def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: - path = self.tenant_dir(tenant_id) / 
"config-v1" + def read_tenant_location_conf( + self, tenant_shard_id: Union[TenantId, TenantShardId] + ) -> dict[str, Any]: + path = self.tenant_dir(tenant_shard_id) / "config-v1" log.info(f"Reading location conf from {path}") bytes = open(path, "r").read() try: @@ -1874,42 +2696,48 @@ class NeonPageserver(PgProtocol): generation: Optional[int] = None, ) -> TenantId: if generation is None: - generation = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) return client.tenant_create(tenant_id, conf, generation=generation) def tenant_load(self, tenant_id: TenantId): client = self.http_client() return client.tenant_load( - tenant_id, generation=self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id) ) + def list_layers( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. -def append_pageserver_param_overrides( - params_to_update: List[str], - remote_storage: Optional[RemoteStorage], - pageserver_config_override: Optional[str] = None, -): - if remote_storage is not None: - remote_storage_toml_table = remote_storage_to_toml_inline_table(remote_storage) + :return: list of relative paths to layers, from the timeline root. 
+ """ + timeline_path = self.timeline_dir(tenant_id, timeline_id) - params_to_update.append( - f"--pageserver-config-override=remote_storage={remote_storage_toml_table}" + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) ) - else: - params_to_update.append('--pageserver-config-override=remote_storage=""') - env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES") - if env_overrides is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" for o in env_overrides.split(";") - ] - - if pageserver_config_override is not None: - params_to_update += [ - f"--pageserver-config-override={o.strip()}" - for o in pageserver_config_override.split(";") - ] + def layer_exists( + self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: LayerName + ) -> bool: + layers = self.list_layers(tenant_id, timeline_id) + return layer_name in [parse_layer_file_name(p.name) for p in layers] class PgBin: @@ -1934,7 +2762,12 @@ class PgBin: env.update(env_add) return env - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ): """ Run one of the postgres binaries. @@ -1982,12 +2815,49 @@ class PgBin: ) return base_path + def get_pg_controldata_checkpoint_lsn(self, pgdata: str) -> Lsn: + """ + Run pg_controldata on given datadir and extract checkpoint lsn. 
+ """ + + pg_controldata_path = os.path.join(self.pg_bin_path, "pg_controldata") + cmd = f"{pg_controldata_path} -D {pgdata}" + result = subprocess.run(cmd, capture_output=True, text=True, shell=True) + checkpoint_lsn = re.findall( + "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout + )[0] + log.info(f"last checkpoint at {checkpoint_lsn}") + return Lsn(checkpoint_lsn) + + def take_fullbackup( + self, + pageserver: NeonPageserver, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, + output: Path, + ): + """ + Request fullbackup from pageserver, store it at 'output'. + """ + cmd = [ + "psql", + "--no-psqlrc", + pageserver.connstr(), + "-c", + f"fullbackup {tenant} {timeline} {lsn}", + "-o", + str(output), + ] + self.run_capture(cmd) + @pytest.fixture(scope="function") def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: return PgBin(test_output_dir, pg_distrib_dir, pg_version) +# TODO make port an optional argument class VanillaPostgres(PgProtocol): def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init: bool = True): super().__init__(host="localhost", port=port, dbname="postgres") @@ -2277,6 +3147,7 @@ class NeonProxy(PgProtocol): self.auth_backend = auth_backend self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval + self.http_timeout_seconds = 15 self._popen: Optional[subprocess.Popen[bytes]] = None def start(self) -> NeonProxy: @@ -2315,6 +3186,7 @@ class NeonProxy(PgProtocol): *["--proxy", f"{self.host}:{self.proxy_port}"], *["--mgmt", f"{self.host}:{self.mgmt_port}"], *["--wss", f"{self.host}:{self.external_http_port}"], + *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"], *["-c", str(crt_path)], *["-k", str(key_path)], *self.auth_backend.extra_args(), @@ -2351,9 +3223,12 @@ class NeonProxy(PgProtocol): def http_query(self, query, args, **kwargs): # TODO maybe use default values if not provided - user = 
kwargs["user"] - password = kwargs["password"] + user = quote(kwargs["user"]) + password = quote(kwargs["password"]) expected_code = kwargs.get("expected_code") + timeout = kwargs.get("timeout") + + log.info(f"Executing http query: {query}") connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" response = requests.post( @@ -2365,15 +3240,42 @@ class NeonProxy(PgProtocol): "Neon-Pool-Opt-In": "true", }, verify=str(self.test_output_dir / "proxy.crt"), + timeout=timeout, ) if expected_code is not None: - assert response.status_code == kwargs["expected_code"], f"response: {response.json()}" + assert response.status_code == expected_code, f"response: {response.json()}" return response.json() + async def http2_query(self, query, args, **kwargs): + # TODO maybe use default values if not provided + user = kwargs["user"] + password = kwargs["password"] + expected_code = kwargs.get("expected_code") + + log.info(f"Executing http2 query: {query}") + + connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres" + async with httpx.AsyncClient( + http2=True, verify=str(self.test_output_dir / "proxy.crt") + ) as client: + response = await client.post( + f"https://{self.domain}:{self.external_http_port}/sql", + json={"query": query, "params": args}, + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Pool-Opt-In": "true", + }, + ) + assert response.http_version == "HTTP/2" + + if expected_code is not None: + assert response.status_code == expected_code, f"response: {response.json()}" + return response.json() + def get_metrics(self) -> str: request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") - request_result.raise_for_status() return request_result.text @staticmethod @@ -2520,7 +3422,7 @@ def static_proxy( yield proxy -class Endpoint(PgProtocol): +class Endpoint(PgProtocol, LogUtils): """An object representing a Postgres compute endpoint managed by the control 
plane.""" def __init__( @@ -2544,6 +3446,19 @@ class Endpoint(PgProtocol): self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf + # This lock prevents concurrent start & stop operations, keeping `self.running` consistent + # with whether we're really running. Tests generally wouldn't try and do these concurrently, + # but endpoints are also stopped during test teardown, which might happen concurrently with + # destruction of objects in tests. + self.lock = threading.Lock() + + def http_client( + self, auth_token: Optional[str] = None, retries: Optional[Retry] = None + ) -> EndpointHttpClient: + return EndpointHttpClient( + port=self.http_port, + ) + def create( self, branch_name: str, @@ -2552,6 +3467,7 @@ class Endpoint(PgProtocol): lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Create a new Postgres endpoint. @@ -2574,21 +3490,28 @@ class Endpoint(PgProtocol): pg_port=self.pg_port, http_port=self.http_port, pageserver_id=pageserver_id, + allow_multiple=allow_multiple, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) + self.logfile = self.endpoint_path() / "compute.log" config_lines = config_lines or [] # set small 'max_replication_write_lag' to enable backpressure # and make tests more stable. config_lines = ["max_replication_write_lag=15MB"] + config_lines + + config_lines = ["neon.primary_is_running=on"] + config_lines self.config(config_lines) return self def start( - self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None + self, + remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, + allow_multiple: bool = False, ) -> "Endpoint": """ Start the Postgres instance. 
@@ -2599,13 +3522,15 @@ class Endpoint(PgProtocol): log.info(f"Starting postgres endpoint {self.endpoint_id}") - self.env.neon_cli.endpoint_start( - self.endpoint_id, - safekeepers=self.active_safekeepers, - remote_ext_config=remote_ext_config, - pageserver_id=pageserver_id, - ) - self.running = True + with self.lock: + self.env.neon_cli.endpoint_start( + self.endpoint_id, + safekeepers=self.active_safekeepers, + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) + self.running = True return self @@ -2645,6 +3570,17 @@ class Endpoint(PgProtocol): return self + def edit_hba(self, hba: List[str]): + """Prepend hba lines into pg_hba.conf file.""" + with open(os.path.join(self.pg_data_dir_path(), "pg_hba.conf"), "r+") as conf_file: + data = conf_file.read() + conf_file.seek(0) + conf_file.write("\n".join(hba) + "\n") + conf_file.write(data) + + if self.running: + self.safe_psql("SELECT pg_reload_conf()") + def reconfigure(self, pageserver_id: Optional[int] = None): assert self.endpoint_id is not None self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id) @@ -2658,8 +3594,20 @@ class Endpoint(PgProtocol): # Write it back updated with open(config_path, "w") as file: + log.info(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) + # Please note: Migrations only run if pg_skip_catalog_updates is false + def wait_for_migrations(self): + with self.cursor() as cur: + + def check_migrations_done(): + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cur.fetchall()[0][0] + assert migration_id != 0 + + wait_until(20, 0.5, check_migrations_done) + # Mock the extension part of spec passed from control plane for local testing # endpooint.rs adds content of this file as a part of the spec.json def create_remote_extension_spec(self, spec: dict[str, Any]): @@ -2671,33 +3619,39 @@ class Endpoint(PgProtocol): with 
open(remote_extensions_spec_path, "w") as file: json.dump(spec, file, indent=4) - def stop(self) -> "Endpoint": + def stop(self, mode: str = "fast") -> "Endpoint": """ Stop the Postgres instance if it's running. + + Because test teardown might try and stop an endpoint concurrently with test code + stopping the endpoint, this method is thread safe + Returns self. """ - if self.running: - assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( - self.endpoint_id, check_return_code=self.check_stop_result - ) - self.running = False + with self.lock: + if self.running: + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, check_return_code=self.check_stop_result, mode=mode + ) + self.running = False return self - def stop_and_destroy(self) -> "Endpoint": + def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint": """ Stop the Postgres instance, then destroy the endpoint. Returns self. """ - assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( - self.endpoint_id, True, check_return_code=self.check_stop_result - ) - self.endpoint_id = None - self.running = False + with self.lock: + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode + ) + self.endpoint_id = None + self.running = False return self @@ -2710,6 +3664,7 @@ class Endpoint(PgProtocol): config_lines: Optional[List[str]] = None, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + allow_multiple=False, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. 
@@ -2725,7 +3680,12 @@ class Endpoint(PgProtocol): hot_standby=hot_standby, lsn=lsn, pageserver_id=pageserver_id, - ).start(remote_ext_config=remote_ext_config) + allow_multiple=allow_multiple, + ).start( + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -2864,7 +3824,7 @@ class SafekeeperPort: @dataclass -class Safekeeper: +class Safekeeper(LogUtils): """An object representing a running safekeeper daemon.""" env: NeonEnv @@ -2872,6 +3832,13 @@ class Safekeeper: id: int running: bool = False + def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False): + self.env = env + self.port = port + self.id = id + self.running = running + self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper": assert self.running is False self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts) @@ -2894,11 +3861,14 @@ class Safekeeper: return self def stop(self, immediate: bool = False) -> "Safekeeper": - log.info("Stopping safekeeper {}".format(self.id)) + log.info(f"Stopping safekeeper {self.id}") self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self + def assert_no_errors(self): + assert not self.log_contains("manager task finished prematurely") + def append_logical_message( self, tenant_id: TenantId, timeline_id: TimelineId, request: Dict[str, Any] ) -> Dict[str, Any]: @@ -2926,17 +3896,52 @@ class Safekeeper: assert isinstance(res, dict) return res - def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: + def http_client( + self, auth_token: Optional[str] = None, gen_sk_wide_token: bool = True + ) -> SafekeeperHttpClient: + """ + When auth_token is None but gen_sk_wide is True creates safekeeper wide + token, which is a reasonable default. 
+ """ + if auth_token is None and gen_sk_wide_token: + auth_token = self.env.auth_keys.generate_safekeeper_token() is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper") return SafekeeperHttpClient( port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled ) - def data_dir(self) -> str: - return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") + def get_timeline_start_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + timeline_status = self.http_client().timeline_status(tenant_id, timeline_id) + timeline_start_lsn = timeline_status.timeline_start_lsn + log.info(f"sk {self.id} timeline start LSN: {timeline_start_lsn}") + return timeline_start_lsn - def timeline_dir(self, tenant_id, timeline_id) -> str: - return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id)) + def get_flush_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + timeline_status = self.http_client().timeline_status(tenant_id, timeline_id) + flush_lsn = timeline_status.flush_lsn + log.info(f"sk {self.id} flush LSN: {flush_lsn}") + return flush_lsn + + def pull_timeline( + self, srcs: list[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId + ) -> Dict[str, Any]: + """ + pull_timeline from srcs to self. 
+ """ + src_https = [f"http://localhost:{sk.port.http}" for sk in srcs] + res = self.http_client().pull_timeline( + {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "http_hosts": src_https} + ) + src_ids = [sk.id for sk in srcs] + log.info(f"finished pulling timeline from {src_ids} to {self.id}") + return res + + @property + def data_dir(self) -> Path: + return self.env.repo_dir / "safekeepers" / f"sk{self.id}" + + def timeline_dir(self, tenant_id, timeline_id) -> Path: + return self.data_dir / str(tenant_id) / str(timeline_id) def list_segments(self, tenant_id, timeline_id) -> List[str]: """ @@ -2949,201 +3954,53 @@ class Safekeeper: segments.sort() return segments - -# Walreceiver as returned by sk's timeline status endpoint. -@dataclass -class Walreceiver: - conn_id: int - state: str - - -@dataclass -class SafekeeperTimelineStatus: - acceptor_epoch: int - pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - flush_lsn: Lsn - commit_lsn: Lsn - timeline_start_lsn: Lsn - backup_lsn: Lsn - peer_horizon_lsn: Lsn - remote_consistent_lsn: Lsn - walreceivers: List[Walreceiver] - - -@dataclass -class SafekeeperMetrics: - # These are metrics from Prometheus which uses float64 internally. - # As a consequence, values may differ from real original int64s. 
- flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) - - -class SafekeeperHttpClient(requests.Session): - HTTPError = requests.HTTPError - - def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): - super().__init__() - self.port = port - self.auth_token = auth_token - self.is_testing_enabled = is_testing_enabled - - if auth_token is not None: - self.headers["Authorization"] = f"Bearer {auth_token}" - - def check_status(self): - self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - - def is_testing_enabled_or_skip(self): - if not self.is_testing_enabled: - pytest.skip("safekeeper was built without 'testing' feature") - - def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): - self.is_testing_enabled_or_skip() - - if isinstance(config_strings, tuple): - pairs = [config_strings] - else: - pairs = config_strings - - log.info(f"Requesting config failpoints: {repr(pairs)}") - - res = self.put( - f"http://localhost:{self.port}/v1/failpoints", - json=[{"name": name, "actions": actions} for name, actions in pairs], - ) - log.info(f"Got failpoints request response code {res.status_code}") - res.raise_for_status() - res_json = res.json() - assert res_json is None - return res_json - - def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: - params = params or {} - res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) - res.raise_for_status() - res_json = json.loads(res.text) - assert isinstance(res_json, dict) - return res_json - - def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: - res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def copy_timeline(self, tenant_id: 
TenantId, timeline_id: TimelineId, body: Dict[str, Any]): - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", - json=body, - ) - res.raise_for_status() - - def timeline_digest( - self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn - ) -> Dict[str, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", - params={ - "from_lsn": str(from_lsn), - "until_lsn": str(until_lsn), - }, - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def timeline_create( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 - commit_lsn: Lsn, + def checkpoint_up_to( + self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, wait_wal_removal=True ): - body = { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "pg_version": pg_version, - "commit_lsn": str(commit_lsn), - } - res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) - res.raise_for_status() + """ + Assuming pageserver(s) uploaded to s3 up to `lsn`, + 1) wait for remote_consistent_lsn and wal_backup_lsn on safekeeper to reach it. + 2) checkpoint timeline on safekeeper, which should remove WAL before this LSN; optionally wait for that. 
+ """ + cli = self.http_client() - def timeline_status( - self, tenant_id: TenantId, timeline_id: TimelineId - ) -> SafekeeperTimelineStatus: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") - res.raise_for_status() - resj = res.json() - walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] - return SafekeeperTimelineStatus( - acceptor_epoch=resj["acceptor_state"]["epoch"], - pg_version=resj["pg_info"]["pg_version"], - flush_lsn=Lsn(resj["flush_lsn"]), - commit_lsn=Lsn(resj["commit_lsn"]), - timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), - backup_lsn=Lsn(resj["backup_lsn"]), - peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), - remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), - walreceivers=walreceivers, - ) + target_segment_file = lsn.segment_name() - def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): - res = self.post( - f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", - json=body, - ) - res.raise_for_status() - - def timeline_delete_force(self, tenant_id: TenantId, timeline_id: TimelineId) -> Dict[Any, Any]: - res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" - ) - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: - res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") - res.raise_for_status() - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def get_metrics_str(self) -> str: - request_result = self.get(f"http://localhost:{self.port}/metrics") - request_result.raise_for_status() - return request_result.text - - def get_metrics(self) -> SafekeeperMetrics: - all_metrics_text = self.get_metrics_str() - - metrics = SafekeeperMetrics() - for match in re.finditer( - 
r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( - match.group(3) + def are_segments_removed(): + segments = self.list_segments(tenant_id, timeline_id) + log.info( + f"waiting for all segments before {target_segment_file} to be removed from sk {self.id}, current segments: {segments}" ) - for match in re.finditer( - r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', - all_metrics_text, - re.MULTILINE, - ): - metrics.commit_lsn_inexact[ - (TenantId(match.group(1)), TimelineId(match.group(2))) - ] = int(match.group(3)) - return metrics + assert all(target_segment_file <= s for s in segments) + + def are_lsns_advanced(): + stat = cli.timeline_status(tenant_id, timeline_id) + log.info( + f"waiting for remote_consistent_lsn and backup_lsn on sk {self.id} to reach {lsn}, currently remote_consistent_lsn={stat.remote_consistent_lsn}, backup_lsn={stat.backup_lsn}" + ) + assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn() + + # xxx: max wait is long because we might be waiting for reconnection from + # pageserver to this safekeeper + wait_until(30, 1, are_lsns_advanced) + cli.checkpoint(tenant_id, timeline_id) + if wait_wal_removal: + wait_until(30, 1, are_segments_removed) + + def wait_until_paused(self, failpoint: str): + msg = f"at failpoint {failpoint}" + + def paused(): + log.info(f"waiting for hitting failpoint {failpoint}") + self.assert_log_contains(msg) + + wait_until(20, 0.5, paused) -class S3Scrubber: - def __init__(self, log_dir: Path, env: NeonEnvBuilder): +class StorageScrubber: + def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): self.env = env - self.log_dir = log_dir + self.log_dir = log_dir or env.test_output_dir def scrubber_cli(self, args: list[str], timeout) -> str: assert 
isinstance(self.env.pageserver_remote_storage, S3Storage) @@ -3160,11 +4017,11 @@ class S3Scrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) - base_args = [str(self.env.neon_binpath / "s3_scrubber")] + base_args = [str(self.env.neon_binpath / "storage_scrubber")] args = base_args + args (output_path, stdout, status_code) = subprocess_capture( - self.log_dir, + self.env.test_output_dir, args, echo_stderr=True, echo_stdout=True, @@ -3178,13 +4035,15 @@ class S3Scrubber: log.warning(f"Scrub environment: {env}") log.warning(f"Output at: {output_path}") - raise RuntimeError("Remote storage scrub failed") + raise RuntimeError(f"Scrubber failed while running {args}") assert stdout is not None return stdout def scan_metadata(self) -> Any: - stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30) + stdout = self.scrubber_cli( + ["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30 + ) try: return json.loads(stdout) @@ -3193,11 +4052,42 @@ class S3Scrubber: log.error(stdout) raise + def tenant_snapshot(self, tenant_id: TenantId, output_path: Path): + stdout = self.scrubber_cli( + ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)], + timeout=30, + ) + log.info(f"tenant-snapshot output: {stdout}") -def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - """Compute the working directory for an individual test.""" + def pageserver_physical_gc( + self, min_age_secs: int, tenant_ids: Optional[list[TenantId]] = None + ): + args = ["pageserver-physical-gc", "--min-age", f"{min_age_secs}s"] + + if tenant_ids is None: + tenant_ids = [] + + for tenant_id in tenant_ids: + args.extend(["--tenant-id", str(tenant_id)]) + + stdout = self.scrubber_cli( + args, + timeout=30, + ) + try: + return json.loads(stdout) + except: + log.error( + "Failed to decode JSON output from `pageserver-physical_gc`. 
Dumping stdout:" + ) + log.error(stdout) + raise + + +def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: + """Compute the path to a working directory for an individual test.""" test_name = request.node.name - test_dir = top_output_dir / test_name.replace("/", "-") + test_dir = top_output_dir / f"{prefix}{test_name.replace('/', '-')}" # We rerun flaky tests multiple times, use a separate directory for each run. if (suffix := getattr(request.node, "execution_count", None)) is not None: @@ -3209,6 +4099,25 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return test_dir +def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + The working directory for a test. + """ + return _get_test_dir(request, top_output_dir, "") + + +def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + Directory that contains `upperdir` and `workdir` for overlayfs mounts + that a test creates. See `NeonEnvBuilder.overlay_mount`. + """ + return _get_test_dir(request, top_output_dir, "overlay-") + + +def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path: + return top_output_dir / "shared-snapshots" / snapshot_name + + def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: return get_test_output_dir(request, top_output_dir) / "repo" @@ -3223,7 +4132,7 @@ def pytest_addoption(parser: Parser): SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)" + r"config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" ) @@ -3236,8 +4145,12 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] # scope. So it uses the get_test_output_dir() function to get the path, and # this fixture ensures that the directory exists. That works because # 'autouse' fixtures are run before other fixtures. 
+# +# NB: we request the overlay dir fixture so the fixture does its cleanups @pytest.fixture(scope="function", autouse=True) -def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[Path]: +def test_output_dir( + request: FixtureRequest, top_output_dir: Path, test_overlay_dir: Path +) -> Iterator[Path]: """Create the working directory for an individual test.""" # one directory per test @@ -3251,6 +4164,112 @@ def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[P allure_attach_from_dir(test_dir) +class FileAndThreadLock: + def __init__(self, path: Path): + self.path = path + self.thread_lock = threading.Lock() + self.fd: Optional[int] = None + + def __enter__(self): + self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY) + # lock thread lock before file lock so that there's no race + # around flocking / funlocking the file lock + self.thread_lock.acquire() + flock(self.fd, LOCK_EX) + + def __exit__(self, exc_type, exc_value, exc_traceback): + assert self.fd is not None + assert self.thread_lock.locked() # ... by us + flock(self.fd, LOCK_UN) + self.thread_lock.release() + os.close(self.fd) + self.fd = None + + +class SnapshotDirLocked: + def __init__(self, parent: SnapshotDir): + self._parent = parent + + def is_initialized(self): + # TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized. + # Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed. 
+ return self._parent._marker_file_path.exists() + + def set_initialized(self): + self._parent._marker_file_path.write_text("") + + @property + def path(self) -> Path: + return self._parent._path / "snapshot" + + +class SnapshotDir: + _path: Path + + def __init__(self, path: Path): + self._path = path + assert self._path.is_dir() + self._lock = FileAndThreadLock(self._lock_file_path) + + @property + def _lock_file_path(self) -> Path: + return self._path / "initializing.flock" + + @property + def _marker_file_path(self) -> Path: + return self._path / "initialized.marker" + + def __enter__(self) -> SnapshotDirLocked: + self._lock.__enter__() + return SnapshotDirLocked(self) + + def __exit__(self, exc_type, exc_value, exc_traceback): + self._lock.__exit__(exc_type, exc_value, exc_traceback) + + +def shared_snapshot_dir(top_output_dir, ident: str) -> SnapshotDir: + snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident) + snapshot_dir_path.mkdir(exist_ok=True, parents=True) + return SnapshotDir(snapshot_dir_path) + + +@pytest.fixture(scope="function") +def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: + """ + Idempotently create a test's overlayfs mount state directory. + If the functionality isn't enabled via env var, returns None. + + The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc). 
+ """ + + if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None: + return None + + overlay_dir = get_test_overlay_dir(request, top_output_dir) + log.info(f"test_overlay_dir is {overlay_dir}") + + overlay_dir.mkdir(exist_ok=True) + # unmount stale overlayfs mounts which subdirectories of `overlay_dir/*` as the overlayfs `upperdir` and `workdir` + for mountpoint in overlayfs.iter_mounts_beneath(get_test_output_dir(request, top_output_dir)): + cmd = ["sudo", "umount", str(mountpoint)] + log.info( + f"Unmounting stale overlayfs mount probably created during earlier test run: {cmd}" + ) + subprocess.run(cmd, capture_output=True, check=True) + # the overlayfs `workdir`` is owned by `root`, shutil.rmtree won't work. + cmd = ["sudo", "rm", "-rf", str(overlay_dir)] + subprocess.run(cmd, capture_output=True, check=True) + + overlay_dir.mkdir() + + return overlay_dir + + # no need to clean up anything: on clean shutdown, + # NeonEnvBuilder.overlay_cleanup_teardown takes care of cleanup + # and on unclean shutdown, this function will take care of it + # on the next test run + + SKIP_DIRS = frozenset( ( "pg_wal", @@ -3320,33 +4339,34 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]: # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content( - test_output_dir: Path, env: NeonEnv, endpoint: Endpoint, pageserver_id: Optional[int] = None + test_output_dir: Path, + env: NeonEnv, + endpoint: Endpoint, + ignored_files: Optional[list[str]] = None, ): + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) + # Get the timeline ID. 
We need it for the 'basebackup' command timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0]) - # many tests already checkpoint, but do it just in case - with closing(endpoint.connect()) as conn: - with conn.cursor() as cur: - cur.execute("CHECKPOINT") - - # wait for pageserver to catch up - wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id) # stop postgres to ensure that files won't change endpoint.stop() + # Read the shutdown checkpoint's LSN + checkpoint_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(endpoint.pg_data_dir_path()) + # Take a basebackup from pageserver restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir" restored_dir_path.mkdir(exist_ok=True) - pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) psql_path = os.path.join(pg_bin.pg_bin_path, "psql") + pageserver_id = env.storage_controller.locate(endpoint.tenant_id)[0]["node_id"] cmd = rf""" {psql_path} \ --no-psqlrc \ postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg} \ - -c 'basebackup {endpoint.tenant_id} {timeline_id}' \ + -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}' \ | tar -x -C {restored_dir_path} """ @@ -3365,8 +4385,21 @@ def check_restored_datadir_content( # list files we're going to compare assert endpoint.pgdata_dir pgdata_files = list_files_to_compare(Path(endpoint.pgdata_dir)) + restored_files = list_files_to_compare(restored_dir_path) + if pgdata_files != restored_files: + # filter pg_xact and multixact files which are downloaded on demand + pgdata_files = [ + f + for f in pgdata_files + if not f.startswith("pg_xact") and not f.startswith("pg_multixact") + ] + + if ignored_files: + pgdata_files = [f for f in pgdata_files if f not in ignored_files] + restored_files = [f for f in restored_files if f not in ignored_files] + # check that file sets are equal assert pgdata_files == restored_files @@ -3382,13 +4415,13 @@ def check_restored_datadir_content( for f in 
mismatch: f1 = os.path.join(endpoint.pgdata_dir, f) f2 = os.path.join(restored_dir_path, f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd -b {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd -b {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, error) == ([], []) @@ -3410,19 +4443,135 @@ def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) - time.sleep(0.5) +def tenant_get_shards( + env: NeonEnv, tenant_id: TenantId, pageserver_id: Optional[int] = None +) -> list[tuple[TenantShardId, NeonPageserver]]: + """ + Helper for when you want to talk to one or more pageservers, and the + caller _might_ have specified a pageserver, or they might leave it to + us to figure out the shards for a tenant. + + If the caller provides `pageserver_id`, it will be used for all shards, even + if the shard is indicated by storage controller to be on some other pageserver. 
+ + Caller should over the response to apply their per-pageserver action to + each shard + """ + if pageserver_id is not None: + override_pageserver = [p for p in env.pageservers if p.id == pageserver_id][0] + else: + override_pageserver = None + + if len(env.pageservers) > 1: + return [ + ( + TenantShardId.parse(s["shard_id"]), + override_pageserver or env.get_pageserver(s["node_id"]), + ) + for s in env.storage_controller.locate(tenant_id) + ] + else: + # Assume an unsharded tenant + return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)] + + +def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint): + primary_lsn = Lsn( + primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False) + ) + while True: + secondary_lsn = Lsn( + secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False) + ) + caught_up = secondary_lsn >= primary_lsn + log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") + if caught_up: + return + time.sleep(1) + + +def log_replica_lag(primary: Endpoint, secondary: Endpoint): + last_replay_lsn = Lsn( + secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False) + ) + primary_lsn = Lsn( + primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False) + ) + lag = primary_lsn - last_replay_lsn + log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}") + + def wait_for_last_flush_lsn( env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId, pageserver_id: Optional[int] = None, + auth_token: Optional[str] = None, ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" + shards = tenant_get_shards(env, tenant, pageserver_id) + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) - return wait_for_last_record_lsn( - env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn - ) 
+ + results = [] + for tenant_shard_id, pageserver in shards: + log.info( + f"wait_for_last_flush_lsn: waiting for {last_flush_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})" + ) + waited = wait_for_last_record_lsn( + pageserver.http_client(auth_token=auth_token), tenant_shard_id, timeline, last_flush_lsn + ) + + assert waited >= last_flush_lsn + results.append(waited) + + # Return the lowest LSN that has been ingested by all shards + return min(results) + + +def flush_ep_to_pageserver( + env: NeonEnv, + ep: Endpoint, + tenant: TenantId, + timeline: TimelineId, + pageserver_id: Optional[int] = None, +) -> Lsn: + """ + Stop endpoint and wait until all committed WAL reaches the pageserver + (last_record_lsn). This is for use by tests which want everything written so + far to reach pageserver *and* expecting that no more data will arrive until + endpoint starts again, so unlike wait_for_last_flush_lsn it polls + safekeepers instead of compute to learn LSN. + + Returns the catch up LSN. + """ + ep.stop() + + commit_lsn: Lsn = Lsn(0) + # In principle in the absense of failures polling single sk would be enough. + for sk in env.safekeepers: + cli = sk.http_client() + # wait until compute connections are gone + wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline)) + commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn) + + # Note: depending on WAL filtering implementation, probably most shards + # won't be able to reach commit_lsn (unless gaps are also ack'ed), so this + # is broken in sharded case. 
+ shards = tenant_get_shards(env, tenant, pageserver_id) + for tenant_shard_id, pageserver in shards: + log.info( + f"flush_ep_to_pageserver: waiting for {commit_lsn} on shard {tenant_shard_id} on pageserver {pageserver.id})" + ) + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, commit_lsn + ) + + assert waited >= commit_lsn + + return commit_lsn def wait_for_wal_insert_lsn( @@ -3434,9 +4583,16 @@ def wait_for_wal_insert_lsn( ) -> Lsn: """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) - return wait_for_last_record_lsn( - env.get_pageserver(pageserver_id).http_client(), tenant, timeline, last_flush_lsn - ) + result = None + for tenant_shard_id, pageserver in tenant_get_shards(env, tenant, pageserver_id): + shard_r = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + ) + if result is None: + result = shard_r + + assert result is not None + return result def fork_at_current_lsn( @@ -3461,6 +4617,7 @@ def last_flush_lsn_upload( tenant_id: TenantId, timeline_id: TimelineId, pageserver_id: Optional[int] = None, + auth_token: Optional[str] = None, ) -> Lsn: """ Wait for pageserver to catch to the latest flush LSN of given endpoint, @@ -3468,13 +4625,15 @@ def last_flush_lsn_upload( reaching flush LSN). 
""" last_flush_lsn = wait_for_last_flush_lsn( - env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver_id, auth_token=auth_token ) - ps_http = env.get_pageserver(pageserver_id).http_client() - wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, last_flush_lsn) - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + shards = tenant_get_shards(env, tenant_id, pageserver_id) + for tenant_shard_id, pageserver in shards: + ps_http = pageserver.http_client(auth_token=auth_token) + wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) + # force a checkpoint to trigger upload + ps_http.timeline_checkpoint(tenant_shard_id, timeline_id) + wait_for_upload(ps_http, tenant_shard_id, timeline_id, last_flush_lsn) return last_flush_lsn @@ -3489,3 +4648,79 @@ def parse_project_git_version_output(s: str) -> str: return commit raise ValueError(f"unable to parse --version output: '{s}'") + + +def generate_uploads_and_deletions( + env: NeonEnv, + *, + init: bool = True, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + data: Optional[str] = None, + pageserver: NeonPageserver, +): + """ + Using the environment's default tenant + timeline, generate a load pattern + that results in some uploads and some deletions to remote storage. 
+ """ + + if tenant_id is None: + tenant_id = env.initial_tenant + assert tenant_id is not None + + if timeline_id is None: + timeline_id = env.initial_timeline + assert timeline_id is not None + + ps_http = pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=tenant_id, pageserver_id=pageserver.id + ) as endpoint: + if init: + endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + def churn(data): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, 200) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) + assert tenant_id is not None + assert timeline_id is not None + # We are waiting for uploads as well as local flush, in order to avoid leaving the system + # in a state where there are "future layers" in remote storage that will generate deletions + # after a restart. + last_flush_lsn_upload( + env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id + ) + + # Compaction should generate some GC-elegible layers + for i in range(0, 2): + churn(f"{i if data is None else data}") + + gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) + print_gc_result(gc_result) + assert gc_result["layers_removed"] > 0 + + # Stop endpoint and flush all data to pageserver, then checkpoint it: this + # ensures that the pageserver is in a fully idle state: there will be no more + # background ingest, no more uploads pending, and therefore no non-determinism + # in subsequent actions like pageserver restarts. 
+ final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + # Finish uploads + wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) + # Finish all remote writes (including deletions) + wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) diff --git a/test_runner/fixtures/overlayfs.py b/test_runner/fixtures/overlayfs.py new file mode 100644 index 0000000000..3e2f661893 --- /dev/null +++ b/test_runner/fixtures/overlayfs.py @@ -0,0 +1,16 @@ +from pathlib import Path +from typing import Iterator + +import psutil + + +def iter_mounts_beneath(topdir: Path) -> Iterator[Path]: + """ + Iterate over the overlayfs mounts beneath the specififed `topdir`. + The `topdir` itself isn't considered. + """ + for part in psutil.disk_partitions(all=True): + if part.fstype == "overlay": + mountpoint = Path(part.mountpoint) + if topdir in mountpoint.parents: + yield mountpoint diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 74c6bddf23..147d5705d3 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -55,7 +55,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # FIXME: These need investigation ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", - ".*Removing intermediate uninit mark file.*", # Tenant::delete_timeline() can cause any of the four following errors. 
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946 ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed @@ -67,10 +66,11 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", # these can happen anytime we do compactions from background task and shutdown pageserver - r".*ERROR.*ancestor timeline \S+ is being stopped", + ".*could not compact.*cancelled.*", # this is expected given our collaborative shutdown approach for the UploadQueue ".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*", ".*Compaction failed.*, retrying in .*: ShuttingDown", + ".*Compaction failed.*, retrying in .*: Other\\(timeline shutting down.*", # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", ".*took more than expected to complete.*", @@ -82,9 +82,33 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( # During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this # up is tracked in https://github.com/neondatabase/neon/issues/6096 ".*Cancelled, shutting down.*", + # Open layers are only rolled at Lsn boundaries to avoid name clashses. + # Hence, we can overshoot the soft limit set by checkpoint distance. + # This is especially pronounced in tests that set small checkpoint + # distances. + ".*Flushed oversized open layer with size.*", + # During teardown, we stop the storage controller before the pageservers, so pageservers + # can experience connection errors doing background deletion queue work. 
+ ".*WARN deletion backend: calling control plane generation validation API failed.*error sending request.*", + # Can happen when the test shuts down the storage controller while it is calling the utilization API + ".*WARN.*path=/v1/utilization .*request was dropped before completing", + # Can happen during shutdown + ".*scheduling deletion on drop failed: queue is in state Stopped.*", ) +DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ + # Many tests will take pageservers offline, resulting in log warnings on the controller + # failing to connect to them. + ".*Call to node.*management API.*failed.*receive body.*", + ".*Call to node.*management API.*failed.*ReceiveBody.*", + # Many tests will start up with a node offline + ".*startup_reconcile: Could not scan node.*", + # Tests run in dev mode + ".*Starting in dev mode.*", +] + + def _check_allowed_errors(input): allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) @@ -110,9 +134,10 @@ if __name__ == "__main__": "-i", "--input", type=argparse.FileType("r"), - default=sys.stdin, - help="Pageserver logs file. Reads from stdin if no file is provided.", + help="Pageserver logs file. 
Use '-' for stdin.", + required=True, ) + args = parser.parse_args() errors = _check_allowed_errors(args.input) diff --git a/test_runner/fixtures/pageserver/types.py b/test_runner/fixtures/pageserver/common_types.py similarity index 53% rename from test_runner/fixtures/pageserver/types.py rename to test_runner/fixtures/pageserver/common_types.py index b3c1174b35..a6c327a8a0 100644 --- a/test_runner/fixtures/pageserver/types.py +++ b/test_runner/fixtures/pageserver/common_types.py @@ -1,7 +1,8 @@ +import re from dataclasses import dataclass from typing import Any, Dict, Tuple, Union -from fixtures.types import KEY_MAX, KEY_MIN, Key, Lsn +from fixtures.common_types import KEY_MAX, KEY_MIN, Key, Lsn @dataclass @@ -11,7 +12,7 @@ class IndexLayerMetadata: @dataclass(frozen=True) -class ImageLayerFileName: +class ImageLayerName: lsn: Lsn key_start: Key key_end: Key @@ -25,80 +26,72 @@ class ImageLayerFileName: @dataclass(frozen=True) -class DeltaLayerFileName: +class DeltaLayerName: lsn_start: Lsn lsn_end: Lsn key_start: Key key_end: Key - def is_l0(self): + def is_l0(self) -> bool: return self.key_start == KEY_MIN and self.key_end == KEY_MAX - def to_str(self): + def to_str(self) -> str: ret = f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}" assert self == parse_layer_file_name(ret) return ret -LayerFileName = Union[ImageLayerFileName, DeltaLayerFileName] +LayerName = Union[ImageLayerName, DeltaLayerName] class InvalidFileName(Exception): pass +IMAGE_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" +) + + def parse_image_layer(f_name: str) -> Tuple[int, int, int]: """Parse an image layer file name. 
Return key start, key end, and snapshot lsn""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - try: - return int(key_parts[0], 16), int(key_parts[1], 16), int(parts[1], 16) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + + match = IMAGE_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an image layer filename") + + return int(match.group(1), 16), int(match.group(2), 16), int(match.group(3), 16) + + +DELTA_LAYER_FILE_NAME = re.compile( + "^([A-F0-9]{36})-([A-F0-9]{36})__([A-F0-9]{16})-([A-F0-9]{16})(-v1-[a-f0-9]{8})?$" +) def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]: """Parse a delta layer file name. Return key start, key end, lsn start, and lsn end""" - parts = f_name.split("__") - if len(parts) != 2: - raise InvalidFileName(f"expecting two parts separated by '__', got: {parts}") - key_parts = parts[0].split("-") - if len(key_parts) != 2: - raise InvalidFileName( - f"expecting two key parts separated by '--' in parts[0], got: {key_parts}" - ) - lsn_parts = parts[1].split("-") - if len(lsn_parts) != 2: - raise InvalidFileName( - f"expecting two lsn parts separated by '--' in parts[1], got: {lsn_parts}" - ) - try: - return ( - int(key_parts[0], 16), - int(key_parts[1], 16), - int(lsn_parts[0], 16), - int(lsn_parts[1], 16), - ) - except ValueError as e: - raise InvalidFileName(f"conversion error: {f_name}") from e + match = DELTA_LAYER_FILE_NAME.match(f_name) + if match is None: + raise InvalidFileName(f"'{f_name}' is not an delta layer filename") + + return ( + int(match.group(1), 16), + int(match.group(2), 16), + int(match.group(3), 16), + int(match.group(4), 16), + ) -def parse_layer_file_name(file_name: str) -> 
LayerFileName: +def parse_layer_file_name(file_name: str) -> LayerName: try: key_start, key_end, lsn = parse_image_layer(file_name) - return ImageLayerFileName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) + return ImageLayerName(lsn=Lsn(lsn), key_start=Key(key_start), key_end=Key(key_end)) except InvalidFileName: pass try: key_start, key_end, lsn_start, lsn_end = parse_delta_layer(file_name) - return DeltaLayerFileName( + return DeltaLayerName( lsn_start=Lsn(lsn_start), lsn_end=Lsn(lsn_end), key_start=Key(key_start), @@ -107,21 +100,18 @@ def parse_layer_file_name(file_name: str) -> LayerFileName: except InvalidFileName: pass - raise ValueError() + raise InvalidFileName("neither image nor delta layer") -def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): +def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn): """ Determines if this layer file is considered to be in future meaning we will discard these layers during timeline initialization from the given disk_consistent_lsn. 
""" - if ( - isinstance(layer_file_name, ImageLayerFileName) - and layer_file_name.lsn > disk_consistent_lsn - ): + if isinstance(layer_file_name, ImageLayerName) and layer_file_name.lsn > disk_consistent_lsn: return True elif ( - isinstance(layer_file_name, DeltaLayerFileName) + isinstance(layer_file_name, DeltaLayerName) and layer_file_name.lsn_end > disk_consistent_lsn + 1 ): return True @@ -131,7 +121,7 @@ def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn): @dataclass class IndexPartDump: - layer_metadata: Dict[LayerFileName, IndexLayerMetadata] + layer_metadata: Dict[LayerName, IndexLayerMetadata] disk_consistent_lsn: Lsn @classmethod diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index a779dcc436..d5441bd694 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -4,22 +4,24 @@ import json import time from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Set, Tuple +from datetime import datetime +from typing import Any, Dict, List, Optional, Set, Tuple, Union import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log -from fixtures.metrics import Metrics, parse_metrics +from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import Fn class PageserverApiException(Exception): def __init__(self, message, status_code: int): super().__init__(message) + self.message = message self.status_code = status_code @@ -32,7 +34,7 @@ class TimelineCreate406(PageserverApiException): class TimelineCreate409(PageserverApiException): def __init__(self, res: requests.Response): assert res.status_code == 409 - 
super().__init__("", res.status_code) + super().__init__(res.json()["msg"], res.status_code) @dataclass @@ -54,20 +56,30 @@ class InMemoryLayerInfo: class HistoricLayerInfo: kind: str layer_file_name: str - layer_file_size: Optional[int] + layer_file_size: int lsn_start: str lsn_end: Optional[str] remote: bool + # None for image layers, true if pageserver thinks this is an L0 delta layer + l0: Optional[bool] @classmethod def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: + # instead of parsing the key range lets keep the definition of "L0" in pageserver + l0_ness = d.get("l0") + assert l0_ness is None or isinstance(l0_ness, bool) + + size = d["layer_file_size"] + assert isinstance(size, int) + return HistoricLayerInfo( kind=d["kind"], layer_file_name=d["layer_file_name"], - layer_file_size=d.get("layer_file_size"), + layer_file_size=size, lsn_start=d["lsn_start"], lsn_end=d.get("lsn_end"), remote=d["remote"], + l0=l0_ness, ) @@ -123,7 +135,7 @@ class TenantConfig: ) -class PageserverHttpClient(requests.Session): +class PageserverHttpClient(requests.Session, MetricsGetter): def __init__( self, port: int, @@ -211,7 +223,7 @@ class PageserverHttpClient(requests.Session): def tenant_create( self, - new_tenant_id: TenantId, + new_tenant_id: Union[TenantId, TenantShardId], conf: Optional[Dict[str, Any]] = None, generation: Optional[int] = None, ) -> TenantId: @@ -239,7 +251,7 @@ class PageserverHttpClient(requests.Session): def tenant_attach( self, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], config: None | Dict[str, Any] = None, config_null: bool = False, generation: Optional[int] = None, @@ -261,15 +273,21 @@ class PageserverHttpClient(requests.Session): ) self.verbose_error(res) - def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): + def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None): params = {} if detach_ignored: params["detach_ignored"] = "true" - res = 
self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) + kwargs = {} + if timeout_secs is not None: + kwargs["timeout"] = timeout_secs + + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs + ) self.verbose_error(res) - def tenant_reset(self, tenant_id: TenantId, drop_cache: bool): + def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool): params = {} if drop_cache: params["drop_cache"] = "true" @@ -278,23 +296,46 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) def tenant_location_conf( - self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None + self, + tenant_id: Union[TenantId, TenantShardId], + location_conf=dict[str, Any], + flush_ms=None, + lazy: Optional[bool] = None, ): body = location_conf.copy() - body["tenant_id"] = str(tenant_id) params = {} if flush_ms is not None: params["flush_ms"] = str(flush_ms) + if lazy is not None: + params["lazy"] = "true" if lazy else "false" + res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config", json=body, params=params, ) self.verbose_error(res) + return res.json() - def tenant_delete(self, tenant_id: TenantId): + def tenant_list_locations(self): + res = self.get( + f"http://localhost:{self.port}/v1/location_config", + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json["tenant_shards"], list) + return res_json + + def tenant_get_location(self, tenant_id: TenantShardId): + res = self.get( + f"http://localhost:{self.port}/v1/location_config/{tenant_id}", + ) + self.verbose_error(res) + return res.json() + + def tenant_delete(self, tenant_id: Union[TenantId, TenantShardId]): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) return res @@ -310,27 +351,46 @@ class PageserverHttpClient(requests.Session): res = 
self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") self.verbose_error(res) - def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + def tenant_status( + self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False + ) -> Dict[Any, Any]: + """ + :activate: hint the server not to accelerate activation of this tenant in response + to this query. False by default for tests, because they generally want to observed the + system rather than interfering with it. This is true by default on the server side, + because in the field if the control plane is GET'ing a tenant it's a sign that it wants + to do something with it. + """ + params = {} + if not activate: + params["activate"] = "false" + + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}", params=params) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) return res_json - def tenant_config(self, tenant_id: TenantId) -> TenantConfig: + def tenant_config(self, tenant_id: Union[TenantId, TenantShardId]) -> TenantConfig: res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config") self.verbose_error(res) return TenantConfig.from_json(res.json()) - def tenant_heatmap_upload(self, tenant_id: TenantId): + def tenant_heatmap_upload(self, tenant_id: Union[TenantId, TenantShardId]): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) - def tenant_secondary_download(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download") + def tenant_secondary_download( + self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None + ) -> tuple[int, dict[Any, Any]]: + url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download" + if wait_ms is not None: + url = url + f"?wait_ms={wait_ms}" + res = self.post(url) 
self.verbose_error(res) + return (res.status_code, res.json()) - def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): + def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( f"http://localhost:{self.port}/v1/tenant/config", @@ -352,10 +412,12 @@ class PageserverHttpClient(requests.Session): del current[key] self.set_tenant_config(tenant_id, current) - def tenant_size(self, tenant_id: TenantId) -> int: + def tenant_size(self, tenant_id: Union[TenantId, TenantShardId]) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] - def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]: + def tenant_size_and_modelinputs( + self, tenant_id: Union[TenantId, TenantShardId] + ) -> Tuple[int, Dict[str, Any]]: """ Returns the tenant size, together with the model inputs as the second tuple item. """ @@ -370,7 +432,7 @@ class PageserverHttpClient(requests.Session): assert isinstance(inputs, dict) return (size, inputs) - def tenant_size_debug(self, tenant_id: TenantId) -> str: + def tenant_size_debug(self, tenant_id: Union[TenantId, TenantShardId]) -> str: """ Returns the tenant size debug info, as an HTML string """ @@ -380,9 +442,31 @@ class PageserverHttpClient(requests.Session): ) return res.text + def tenant_time_travel_remote_storage( + self, + tenant_id: Union[TenantId, TenantShardId], + timestamp: datetime, + done_if_after: datetime, + shard_counts: Optional[List[int]] = None, + ): + """ + Issues a request to perform time travel operations on the remote storage + """ + + if shard_counts is None: + shard_counts = [] + body: Dict[str, Any] = { + "shard_counts": shard_counts, + } + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/time_travel_remote_storage?travel_to={timestamp.isoformat()}Z&done_if_after={done_if_after.isoformat()}Z", + json=body, + ) + self.verbose_error(res) + def timeline_list( 
self, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, ) -> List[Dict[str, Any]]: @@ -403,7 +487,7 @@ class PageserverHttpClient(requests.Session): def timeline_create( self, pg_version: PgVersion, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], new_timeline_id: TimelineId, ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, @@ -437,10 +521,11 @@ class PageserverHttpClient(requests.Session): def timeline_detail( self, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, + force_await_initial_logical_size: bool = False, **kwargs, ) -> Dict[Any, Any]: params = {} @@ -448,6 +533,8 @@ class PageserverHttpClient(requests.Session): params["include-non-incremental-logical-size"] = "true" if include_timeline_dir_layer_file_size_sum: params["include-timeline-dir-layer-file-size-sum"] = "true" + if force_await_initial_logical_size: + params["force-await-initial-logical-size"] = "true" res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", @@ -459,7 +546,9 @@ class PageserverHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json - def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs): + def timeline_delete( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, **kwargs + ): """ Note that deletion is not instant, it is scheduled and performed mostly in the background. So if you need to wait for it to complete use `timeline_delete_wait_completed`. 
@@ -473,7 +562,10 @@ class PageserverHttpClient(requests.Session): assert res_json is None def timeline_gc( - self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + gc_horizon: Optional[int], ) -> dict[str, Any]: """ Unlike most handlers, this will wait for the layers to be actually @@ -496,12 +588,21 @@ class PageserverHttpClient(requests.Session): return res_json def timeline_compact( - self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + force_repartition=False, + force_image_layer_creation=False, + wait_until_uploaded=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -513,28 +614,38 @@ class PageserverHttpClient(requests.Session): res_json = res.json() assert res_json is None + def timeline_preserve_initdb_archive( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + log.info( + f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}" + ) + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive", + ) + self.verbose_error(res) + def timeline_get_lsn_by_timestamp( self, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, - timestamp, - version: Optional[int] = None, + timestamp: datetime, + **kwargs, ): log.info( f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" ) - if version is None: - version_str = "" - else: - version_str = f"&version={version}" res = 
self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}", + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp.isoformat()}Z", + **kwargs, ) self.verbose_error(res) res_json = res.json() return res_json - def timeline_get_timestamp_of_lsn(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn): + def timeline_get_timestamp_of_lsn( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + ): log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, timeline {timeline_id}") res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn?lsn={lsn}", @@ -543,13 +654,33 @@ class PageserverHttpClient(requests.Session): res_json = res.json() return res_json + def timeline_layer_map_info( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): + log.info(f"Requesting layer map info of tenant {tenant_id}, timeline {timeline_id}") + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_checkpoint( - self, tenant_id: TenantId, timeline_id: TimelineId, force_repartition=False + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + force_repartition=False, + force_image_layer_creation=False, + wait_until_uploaded=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" + if wait_until_uploaded: + query["wait_until_uploaded"] = "true" log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -563,7 +694,7 @@ class PageserverHttpClient(requests.Session): def 
timeline_spawn_download_remote_layers( self, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, max_concurrent_downloads: int, ) -> dict[str, Any]: @@ -582,7 +713,7 @@ class PageserverHttpClient(requests.Session): def timeline_poll_download_remote_layers_status( self, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, spawn_response: dict[str, Any], poll_state=None, @@ -604,7 +735,7 @@ class PageserverHttpClient(requests.Session): def timeline_download_remote_layers( self, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, max_concurrent_downloads: int, errors_ok=False, @@ -648,47 +779,37 @@ class PageserverHttpClient(requests.Session): }, ).value - def get_remote_timeline_client_metric( + def get_remote_timeline_client_queue_count( self, - metric_name: str, tenant_id: TenantId, timeline_id: TimelineId, file_kind: str, op_kind: str, - ) -> Optional[float]: - metrics = self.get_metrics() - matches = metrics.query_all( - name=metric_name, + ) -> Optional[int]: + metrics = [ + "pageserver_remote_timeline_client_calls_started_total", + "pageserver_remote_timeline_client_calls_finished_total", + ] + res = self.get_metrics_values( + metrics, filter={ "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "file_kind": str(file_kind), "op_kind": str(op_kind), }, + absence_ok=True, ) - if len(matches) == 0: - value = None - elif len(matches) == 1: - value = matches[0].value - assert value is not None - else: - assert len(matches) < 2, "above filter should uniquely identify metric" - return value - - def get_metric_value( - self, name: str, filter: Optional[Dict[str, str]] = None - ) -> Optional[float]: - metrics = self.get_metrics() - results = metrics.query_all(name, filter=filter) - if not results: - log.info(f'could not find metric "{name}"') + if len(res) != 2: return None - assert len(results) == 1, f"metric {name} with given filters is 
not unique, got: {results}" - return results[0].value + inc, dec = [res[metric] for metric in metrics] + queue_count = int(inc) - int(dec) + assert queue_count >= 0 + return queue_count def layer_map_info( self, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, ) -> LayerMapInfo: res = self.get( @@ -697,7 +818,9 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) return LayerMapInfo.from_json(res.json()) - def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): + def download_layer( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + ): res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", ) @@ -705,14 +828,35 @@ class PageserverHttpClient(requests.Session): assert res.status_code == 200 - def download_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): + def download_all_layers( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId + ): info = self.layer_map_info(tenant_id, timeline_id) for layer in info.historic_layers: if not layer.remote: continue self.download_layer(tenant_id, timeline_id, layer.layer_file_name) - def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): + def detach_ancestor( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + batch_size: int | None = None, + ) -> Set[TimelineId]: + params = {} + if batch_size is not None: + params["batch_size"] = batch_size + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", + params=params, + ) + self.verbose_error(res) + json = res.json() + return set(map(TimelineId, json["reparented_timelines"])) + + def evict_layer( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + ): res = self.delete( 
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", ) @@ -720,7 +864,7 @@ class PageserverHttpClient(requests.Session): assert res.status_code in (200, 304) - def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): + def evict_all_layers(self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) for layer in info.historic_layers: self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) @@ -733,7 +877,7 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) return res.json() - def tenant_break(self, tenant_id: TenantId): + def tenant_break(self, tenant_id: Union[TenantId, TenantShardId]): res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") self.verbose_error(res) @@ -751,3 +895,46 @@ class PageserverHttpClient(requests.Session): self.put( f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}" ).raise_for_status() + + def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int: + detail = self.timeline_detail( + tenant_id, + timeline_id, + include_non_incremental_logical_size=True, + force_await_initial_logical_size=True, + ) + current_logical_size = detail["current_logical_size"] + non_incremental = detail["current_logical_size_non_incremental"] + assert current_logical_size == non_incremental + assert isinstance(current_logical_size, int) + return current_logical_size + + def top_tenants( + self, order_by: str, limit: int, where_shards_lt: int, where_gt: int + ) -> dict[Any, Any]: + res = self.post( + f"http://localhost:{self.port}/v1/top_tenants", + json={ + "order_by": order_by, + "limit": limit, + "where_shards_lt": where_shards_lt, + "where_gt": where_gt, + }, + ) + self.verbose_error(res) + return res.json() # type: ignore + + def perf_info( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: 
TimelineId, + ): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting perf info: tenant {tenant_id}, timeline {timeline_id}") + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/perf_info", + ) + log.info(f"Got perf info response code: {res.status_code}") + self.verbose_error(res) + return res.json() diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py new file mode 100644 index 0000000000..8730d8ef75 --- /dev/null +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -0,0 +1,87 @@ +import concurrent.futures +import time +from typing import Any, Callable, Dict, Tuple + +import fixtures.pageserver.remote_storage +from fixtures.common_types import TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, +) +from fixtures.pageserver.utils import ( + wait_until_tenant_state, +) +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind + + +def single_timeline( + neon_env_builder: NeonEnvBuilder, + setup_template: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], + ncopies: int, +) -> NeonEnv: + """ + Create `ncopies` duplicates of a template tenant that has a single timeline. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + env = neon_env_builder.init_start() + + remote_storage = env.pageserver_remote_storage + assert isinstance(remote_storage, LocalFsStorage) + + ps_http = env.pageserver.http_client() + # clean up the useless default tenant + ps_http.tenant_delete(env.initial_tenant) + + log.info("invoking callback to create template tenant") + template_tenant, template_timeline, template_config = setup_template(env) + log.info( + f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}" + ) + + log.info("detach template tenant form pageserver") + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely + ".*Dropped remote consistent LSN updates.*", + ) + + log.info(f"duplicating template tenant {ncopies} times in S3") + tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) + + log.info("attach duplicated tenants to pageserver") + # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done. + # However, on-demand downloads are quite slow ATM. + # => do the on-demand downloads in Python. 
+ assert ps_http.tenant_list() == [] + # make the attach fail after it created enough on-disk state to retry loading + # the tenant next startup, but before it can start background loops that would start download + ps_http.configure_failpoints(("attach-before-activate", "return")) + env.pageserver.allowed_errors.append( + ".*attach failed, setting tenant state to Broken: attach-before-activate.*" + ) + + def attach_broken(tenant): + env.pageserver.tenant_attach( + tenant, + config=template_config.copy(), + generation=100, + override_storage_controller_generation=True, + ) + time.sleep(0.1) + wait_until_tenant_state(ps_http, tenant, "Broken", 10) + + with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: + executor.map(attach_broken, tenants) + + env.pageserver.stop( + immediate=True + ) # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout + tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants)) + log.info("python-side on-demand download the layer files into local tenant dir") + fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir( + env, tenant_timelines + ) + + return env diff --git a/test_runner/fixtures/pageserver/remote_storage.py b/test_runner/fixtures/pageserver/remote_storage.py new file mode 100644 index 0000000000..0c3612716a --- /dev/null +++ b/test_runner/fixtures/pageserver/remote_storage.py @@ -0,0 +1,116 @@ +import concurrent.futures +import os +import queue +import shutil +import threading +from pathlib import Path +from typing import Any, List, Tuple + +from fixtures.common_types import TenantId, TimelineId +from fixtures.neon_fixtures import NeonEnv, Pagectl +from fixtures.pageserver.common_types import ( + InvalidFileName, + parse_layer_file_name, +) +from fixtures.remote_storage import LocalFsStorage + + +def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId): + remote_storage = 
env.pageserver_remote_storage + assert isinstance(remote_storage, LocalFsStorage) + + src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines" + assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory" + + assert isinstance(remote_storage, LocalFsStorage) + dst_timelines_dir: Path = remote_storage.tenant_path(new_tenant) / "timelines" + dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False) + dst_timelines_dir.mkdir(parents=False, exist_ok=False) + + for tl in src_timelines_dir.iterdir(): + src_tl_dir = src_timelines_dir / tl.name + assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory" + dst_tl_dir = dst_timelines_dir / tl.name + dst_tl_dir.mkdir(parents=False, exist_ok=False) + for file in tl.iterdir(): + shutil.copy2(file, dst_tl_dir) + if "__" in file.name: + Pagectl(env).raw_cli( + [ + "layer", + "rewrite-summary", + str(dst_tl_dir / file.name), + "--new-tenant-id", + str(new_tenant), + ] + ) + else: + # index_part etc need no patching + pass + return None + + +def duplicate_tenant(env: NeonEnv, template_tenant: TenantId, ncopies: int) -> List[TenantId]: + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + def work(tenant_id): + duplicate_one_tenant(env, template_tenant, tenant_id) + + new_tenants: List[TenantId] = [TenantId.generate() for _ in range(0, ncopies)] + with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor: + executor.map(work, new_tenants) + return new_tenants + + +def local_layer_name_from_remote_name(remote_name: str) -> str: + try: + return parse_layer_file_name(remote_name).to_str() + except InvalidFileName as e: + comps = remote_name.rsplit("-", 1) + if len(comps) == 1: + raise InvalidFileName("no generation suffix found") from e + else: + assert len(comps) == 2 + layer_file_name, _generation = comps + try: + return parse_layer_file_name(layer_file_name).to_str() + except InvalidFileName: + raise + + +def 
copy_all_remote_layer_files_to_local_tenant_dir( + env: NeonEnv, tenant_timelines: List[Tuple[TenantId, TimelineId]] +): + remote_storage = env.pageserver_remote_storage + assert isinstance(remote_storage, LocalFsStorage) + work: queue.Queue[Any] = queue.Queue() + for tenant, timeline in tenant_timelines: + remote_timeline_path = remote_storage.timeline_path(tenant, timeline) + local_timeline_path = env.pageserver.timeline_dir(tenant, timeline) + local_timeline_path.mkdir(parents=True, exist_ok=True) + downloads = {} + for remote_layer in remote_timeline_path.glob("*__*"): + local_name = local_layer_name_from_remote_name(remote_layer.name) + assert local_name not in downloads, "remote storage must have had split brain" + downloads[local_name] = remote_layer + for local_name, remote_path in downloads.items(): + work.put((remote_path, local_timeline_path / local_name)) + + def copy_layer_worker(queue): + while True: + item = queue.get() + if item is None: + return + remote_path, local_path = item + # not copy2, so it looks like a recent download, in case that's relevant to e.g. 
eviction + shutil.copy(remote_path, local_path, follow_symlinks=False) + + workers = [] + n_threads = os.cpu_count() or 1 + for _ in range(0, n_threads): + w = threading.Thread(target=copy_layer_worker, args=[work]) + workers.append(w) + w.start() + work.put(None) + for w in workers: + w.join() diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index e7b78cfb9a..72384c138b 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -1,12 +1,17 @@ import time -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple, Union -from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef +from mypy_boto3_s3.type_defs import ( + DeleteObjectOutputTypeDef, + EmptyResponseMetadataTypeDef, + ListObjectsV2OutputTypeDef, + ObjectTypeDef, +) +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.remote_storage import RemoteStorageKind, S3Storage -from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage from fixtures.utils import wait_until @@ -15,14 +20,16 @@ def assert_tenant_state( tenant: TenantId, expected_state: str, message: Optional[str] = None, -): +) -> None: tenant_status = pageserver_http.tenant_status(tenant) log.info(f"tenant_status: {tenant_status}") assert tenant_status["state"]["slug"] == expected_state, message or tenant_status def remote_consistent_lsn( - pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId + pageserver_http: PageserverHttpClient, + tenant: Union[TenantId, TenantShardId], + timeline: TimelineId, ) -> Lsn: detail = pageserver_http.timeline_detail(tenant, timeline) @@ -39,7 +46,7 @@ def remote_consistent_lsn( def wait_for_upload( 
pageserver_http: PageserverHttpClient, - tenant: TenantId, + tenant: Union[TenantId, TenantShardId], timeline: TimelineId, lsn: Lsn, ): @@ -55,12 +62,18 @@ def wait_for_upload( ) time.sleep(1) raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn, current_lsn - ) + f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}" ) +def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str): + if tenant_info["state"]["slug"] == expected_state: + return True + if tenant_info["state"]["slug"] == "Broken": + raise RuntimeError(f"tenant became Broken, not {expected_state}") + return False + + def wait_until_tenant_state( pageserver_http: PageserverHttpClient, tenant_id: TenantId, @@ -78,10 +91,8 @@ def wait_until_tenant_state( log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") else: log.debug(f"Tenant {tenant_id} data: {tenant}") - if tenant["state"]["slug"] == expected_state: + if _tenant_in_expected_state(tenant, expected_state): return tenant - if tenant["state"]["slug"] == "Broken": - raise RuntimeError(f"tenant became Broken, not {expected_state}") time.sleep(period) @@ -90,9 +101,37 @@ def wait_until_tenant_state( ) +def wait_until_all_tenants_state( + pageserver_http: PageserverHttpClient, + expected_state: str, + iterations: int, + period: float = 1.0, + http_error_ok: bool = True, +): + """ + Like wait_until_tenant_state, but checks all tenants. 
+ """ + for _ in range(iterations): + try: + tenants = pageserver_http.tenant_list() + except Exception as e: + if http_error_ok: + log.debug(f"Failed to list tenants: {e}") + else: + raise + else: + if all(map(lambda tenant: _tenant_in_expected_state(tenant, expected_state), tenants)): + return + time.sleep(period) + + raise Exception( + f"Not all tenants became active {expected_state} within {iterations * period} seconds" + ) + + def wait_until_timeline_state( pageserver_http: PageserverHttpClient, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, expected_state: str, iterations: int, @@ -141,7 +180,9 @@ def wait_until_tenant_active( def last_record_lsn( - pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId + pageserver_http_client: PageserverHttpClient, + tenant: Union[TenantId, TenantShardId], + timeline: TimelineId, ) -> Lsn: detail = pageserver_http_client.timeline_detail(tenant, timeline) @@ -152,7 +193,7 @@ def last_record_lsn( def wait_for_last_record_lsn( pageserver_http: PageserverHttpClient, - tenant: TenantId, + tenant: Union[TenantId, TenantShardId], timeline: TimelineId, lsn: Lsn, ) -> Lsn: @@ -163,38 +204,61 @@ def wait_for_last_record_lsn( return current_lsn if i % 10 == 0: log.info( - "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 - ) + f"{tenant}/{timeline} waiting for last_record_lsn to reach {lsn}, now {current_lsn}, iteration {i + 1}" ) time.sleep(0.1) raise Exception( - "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) + f"timed out while waiting for last_record_lsn to reach {lsn}, was {current_lsn}" ) def wait_for_upload_queue_empty( pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): + wait_period_secs = 0.2 while True: all_metrics = pageserver_http.get_metrics() - tl = all_metrics.query_all( - 
"pageserver_remote_timeline_client_calls_unfinished", + started = all_metrics.query_all( + "pageserver_remote_timeline_client_calls_started_total", { "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), }, ) - assert len(tl) > 0 - log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}") - if all(m.value == 0 for m in tl): + finished = all_metrics.query_all( + "pageserver_remote_timeline_client_calls_finished_total", + { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ) + + # this is `started left join finished`; if match, subtracting start from finished, resulting in queue depth + remaining_labels = ["shard_id", "file_kind", "op_kind"] + tl: List[Tuple[Any, float]] = [] + for s in started: + found = False + for f in finished: + if all([s.labels[label] == f.labels[label] for label in remaining_labels]): + assert ( + not found + ), "duplicate match, remaining_labels don't uniquely identify sample" + tl.append((s.labels, int(s.value) - int(f.value))) + found = True + if not found: + tl.append((s.labels, int(s.value))) + assert len(tl) == len(started), "something broken with join logic" + log.info(f"upload queue for {tenant_id}/{timeline_id}:") + for labels, queue_count in tl: + log.info(f" {labels}: {queue_count}") + if all(queue_count == 0 for (_, queue_count) in tl): return - time.sleep(0.2) + time.sleep(wait_period_secs) def wait_timeline_detail_404( pageserver_http: PageserverHttpClient, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, iterations: int, interval: Optional[float] = None, @@ -219,33 +283,28 @@ def wait_timeline_detail_404( def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, - tenant_id: TenantId, + tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, iterations: int = 20, interval: Optional[float] = None, **delete_args, -): +) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) 
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) -if TYPE_CHECKING: - # TODO avoid by combining remote storage related stuff in single type - # and just passing in this type instead of whole builder - from fixtures.neon_fixtures import NeonEnvBuilder - - +# remote_storage must not be None, but that's easier for callers to make mypy happy def assert_prefix_empty( - neon_env_builder: "NeonEnvBuilder", + remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None, allowed_postfix: Optional[str] = None, -): - response = list_prefix(neon_env_builder, prefix) +) -> None: + assert remote_storage is not None + response = list_prefix(remote_storage, prefix) keys = response["KeyCount"] objects: List[ObjectTypeDef] = response.get("Contents", []) common_prefixes = response.get("CommonPrefixes", []) - remote_storage = neon_env_builder.pageserver_remote_storage is_mock_s3 = isinstance(remote_storage, S3Storage) and not remote_storage.cleanup if is_mock_s3: @@ -254,7 +313,7 @@ def assert_prefix_empty( # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865 # this seems like a mock_s3 issue log.warning( - f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0" + f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0" ) keys = 0 elif keys != 0 and len(objects) == 0: @@ -279,21 +338,21 @@ def assert_prefix_empty( ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" -def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None): - response = list_prefix(neon_env_builder, prefix) +# remote_storage must not be None, but that's easier for callers to make mypy happy +def assert_prefix_not_empty(remote_storage: 
Optional[RemoteStorage], prefix: Optional[str] = None): + assert remote_storage is not None + response = list_prefix(remote_storage, prefix) assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}" def list_prefix( - neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None, delimiter: str = "/" + remote: RemoteStorage, prefix: Optional[str] = None, delimiter: str = "/" ) -> ListObjectsV2OutputTypeDef: """ Note that this function takes into account prefix_in_bucket. """ # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api. - remote = neon_env_builder.pageserver_remote_storage assert isinstance(remote, S3Storage), "localfs is currently not supported" - assert remote.client is not None prefix_in_bucket = remote.prefix_in_bucket or "" if not prefix: @@ -312,6 +371,65 @@ def list_prefix( return response +def remote_storage_delete_key( + remote: RemoteStorage, + key: str, +) -> DeleteObjectOutputTypeDef: + """ + Note that this function takes into account prefix_in_bucket. + """ + # For local_fs we need to use a different implementation. As we don't need local_fs, just don't support it for now. 
+ assert isinstance(remote, S3Storage), "localfs is currently not supported" + + prefix_in_bucket = remote.prefix_in_bucket or "" + + # real s3 tests have uniqie per test prefix + # mock_s3 tests use special pageserver prefix for pageserver stuff + key = "/".join((prefix_in_bucket, key)) + + response = remote.client.delete_object( + Bucket=remote.bucket_name, + Key=key, + ) + return response + + +def enable_remote_storage_versioning( + remote: RemoteStorage, +) -> EmptyResponseMetadataTypeDef: + """ + Enable S3 versioning for the remote storage + """ + # local_fs has no support for versioning + assert isinstance(remote, S3Storage), "localfs is currently not supported" + + # The SDK supports enabling versioning on normal S3 as well but we don't want to change + # these settings from a test in a live bucket (also, our access isn't enough nor should it be) + assert not remote.real, "Enabling storage versioning only supported on Mock S3" + + # Workaround to enable self-copy until upstream bug is fixed: https://github.com/getmoto/moto/issues/7300 + remote.client.put_bucket_encryption( + Bucket=remote.bucket_name, + ServerSideEncryptionConfiguration={ + "Rules": [ + { + "ApplyServerSideEncryptionByDefault": {"SSEAlgorithm": "AES256"}, + "BucketKeyEnabled": False, + }, + ] + }, + ) + # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. 
+ response = remote.client.put_bucket_versioning( + Bucket=remote.bucket_name, + VersioningConfiguration={ + "MFADelete": "Disabled", + "Status": "Enabled", + }, + ) + return response + + def wait_tenant_status_404( pageserver_http: PageserverHttpClient, tenant_id: TenantId, @@ -337,16 +455,32 @@ def tenant_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId, iterations: int, + ignore_errors: bool = False, ): - pageserver_http.tenant_delete(tenant_id=tenant_id) + if not ignore_errors: + pageserver_http.tenant_delete(tenant_id=tenant_id) + else: + interval = 0.5 + + def delete_request_sent(): + try: + pageserver_http.tenant_delete(tenant_id=tenant_id) + except PageserverApiException as e: + log.debug(e) + if e.status_code == 404: + return + except Exception as e: + log.debug(e) + + wait_until(iterations, interval=interval, func=delete_request_sent) wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations) MANY_SMALL_LAYERS_TENANT_CONFIG = { "gc_period": "0s", "compaction_period": "0s", - "checkpoint_distance": f"{1024**2}", - "image_creation_threshold": "100", + "checkpoint_distance": 1024**2, + "image_creation_threshold": 100, } diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 53350138dd..0227285822 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -1,50 +1,93 @@ import os -from typing import Optional +from typing import Any, Dict, Optional import pytest -from _pytest.fixtures import FixtureRequest +import toml from _pytest.python import Metafunc from fixtures.pg_version import PgVersion +from fixtures.utils import AuxFileStore """ -Dynamically parametrize tests by Postgres version and build type (debug/release/remote) +Dynamically parametrize tests by different parameters """ @pytest.fixture(scope="function", autouse=True) -def pg_version(request: FixtureRequest) -> Optional[PgVersion]: - # Do not parametrize performance 
tests yet, we need to prepare grafana charts first - if "test_runner/performance" in str(request.node.path): - v = os.environ.get("DEFAULT_PG_VERSION") - return PgVersion(v) - +def pg_version() -> Optional[PgVersion]: return None @pytest.fixture(scope="function", autouse=True) -def build_type(request: FixtureRequest) -> Optional[str]: - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in str(request.node.path): - return os.environ.get("BUILD_TYPE", "").lower() - +def build_type() -> Optional[str]: return None +@pytest.fixture(scope="function", autouse=True) +def platform() -> Optional[str]: + return None + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_virtual_file_io_engine() -> Optional[str]: + return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_aux_file_policy() -> Optional[AuxFileStore]: + return None + + +def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") + if toml_table is None: + return None + v = toml.loads(toml_table) + assert isinstance(v, dict) + return v + + +@pytest.fixture(scope="function", autouse=True) +def pageserver_default_tenant_config_compaction_algorithm() -> Optional[Dict[str, Any]]: + return get_pageserver_default_tenant_config_compaction_algorithm() + + def pytest_generate_tests(metafunc: Metafunc): - # Do not parametrize performance tests yet, we need to prepare grafana charts first - if "test_runner/performance" in metafunc.definition._nodeid: - return - - if (v := os.environ.get("DEFAULT_PG_VERSION")) is None: - pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] - else: - pg_versions = [PgVersion(v)] - - if (bt := os.environ.get("BUILD_TYPE")) is None: + if (bt := os.getenv("BUILD_TYPE")) is None: build_types = ["debug", "release"] 
else: build_types = [bt.lower()] metafunc.parametrize("build_type", build_types) + + if (v := os.getenv("DEFAULT_PG_VERSION")) is None: + pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET] + else: + pg_versions = [PgVersion(v)] + metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions)) + + # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=std-fs` + # And do not change test name for default `pageserver_virtual_file_io_engine=tokio-epoll-uring` to keep tests statistics + if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( + "", + "tokio-epoll-uring", + ): + metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine]) + + # Same hack for pageserver_default_tenant_config_compaction_algorithm + if ( + explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() + ) is not None: + metafunc.parametrize( + "pageserver_default_tenant_config_compaction_algorithm", + [explicit_default], + ids=[explicit_default["kind"]], + ) + + # For performance tests, parametrize also by platform + if ( + "test_runner/performance" in metafunc.definition._nodeid + and (platform := os.getenv("PLATFORM")) is not None + ): + metafunc.parametrize("platform", [platform.lower()]) diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 657718da00..941889a2f5 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -52,7 +52,7 @@ class PgVersion(str, enum.Enum): return None -DEFAULT_VERSION: PgVersion = PgVersion.V14 +DEFAULT_VERSION: PgVersion = PgVersion.V15 def skip_on_postgres(version: PgVersion, reason: str): @@ -78,6 +78,13 @@ def pytest_addoption(parser: Parser): ) +def run_only_on_default_postgres(reason: str): + return pytest.mark.skipif( + PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION, + reason=reason, + ) + + def 
pytest_configure(config: Config): if config.getoption("--pg-version"): raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead") diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index c0c2383feb..6f6526d3fc 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -12,8 +12,8 @@ import boto3 import toml from mypy_boto3_s3 import S3Client +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.types import TenantId, TimelineId TIMELINE_INDEX_PART_FILE_NAME = "index_part.json" TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json" @@ -50,7 +50,7 @@ class MockS3Server: # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux # if a process is started from the shell process. - self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) + self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"]) error = None try: return_code = self.subprocess.poll() @@ -141,11 +141,13 @@ class LocalFsStorage: with self.heatmap_path(tenant_id).open("r") as f: return json.load(f) - def to_toml_inline_table(self) -> str: - rv = { + def to_toml_dict(self) -> Dict[str, Any]: + return { "local_path": str(self.root), } - return toml.TomlEncoder().dump_inline_table(rv) + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def cleanup(self): # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files @@ -160,20 +162,31 @@ class LocalFsStorage: class S3Storage: bucket_name: str bucket_region: str - access_key: str - secret_key: str + access_key: Optional[str] + secret_key: Optional[str] + aws_profile: Optional[str] prefix_in_bucket: str client: 
S3Client cleanup: bool """Is this MOCK_S3 (false) or REAL_S3 (true)""" real: bool endpoint: Optional[str] = None + """formatting deserialized with humantime crate, for example "1s".""" + custom_timeout: Optional[str] = None def access_env_vars(self) -> Dict[str, str]: - return { - "AWS_ACCESS_KEY_ID": self.access_key, - "AWS_SECRET_ACCESS_KEY": self.secret_key, - } + if self.aws_profile is not None: + return { + "AWS_PROFILE": self.aws_profile, + } + if self.access_key is not None and self.secret_key is not None: + return { + "AWS_ACCESS_KEY_ID": self.access_key, + "AWS_SECRET_ACCESS_KEY": self.secret_key, + } + raise RuntimeError( + "Either AWS_PROFILE or (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY) have to be set for S3Storage" + ) def to_string(self) -> str: return json.dumps( @@ -185,7 +198,7 @@ class S3Storage: } ) - def to_toml_inline_table(self) -> str: + def to_toml_dict(self) -> Dict[str, Any]: rv = { "bucket_name": self.bucket_name, "bucket_region": self.bucket_region, @@ -197,7 +210,13 @@ class S3Storage: if self.endpoint is not None: rv["endpoint"] = self.endpoint - return toml.TomlEncoder().dump_inline_table(rv) + if self.custom_timeout is not None: + rv["timeout"] = self.custom_timeout + + return rv + + def to_toml_inline_table(self) -> str: + return toml.TomlEncoder().dump_inline_table(self.to_toml_dict()) def do_cleanup(self): if not self.cleanup: @@ -243,6 +262,22 @@ class S3Storage: log.info(f"deleted {cnt} objects from remote storage") + def tenants_path(self) -> str: + return f"{self.prefix_in_bucket}/tenants" + + def tenant_path(self, tenant_id: TenantId) -> str: + return f"{self.tenants_path()}/{tenant_id}" + + def heatmap_key(self, tenant_id: TenantId) -> str: + return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}" + + def heatmap_content(self, tenant_id: TenantId): + r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id)) + return json.loads(r["Body"].read().decode("utf-8")) + + def 
mock_remote_tenant_path(self, tenant_id: TenantId): + assert self.real is False + RemoteStorage = Union[LocalFsStorage, S3Storage] @@ -308,6 +343,7 @@ class RemoteStorageKind(str, enum.Enum): bucket_region=mock_region, access_key=access_key, secret_key=secret_key, + aws_profile=None, prefix_in_bucket="", client=client, cleanup=False, @@ -317,12 +353,11 @@ class RemoteStorageKind(str, enum.Enum): assert self == RemoteStorageKind.REAL_S3 env_access_key = os.getenv("AWS_ACCESS_KEY_ID") - assert env_access_key, "no aws access key provided" env_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") - assert env_secret_key, "no aws access key provided" - - # session token is needed for local runs with sso auth - session_token = os.getenv("AWS_SESSION_TOKEN") + env_profile = os.getenv("AWS_PROFILE") + assert ( + env_access_key and env_secret_key + ) or env_profile, "need to specify either access key and secret access key or profile" bucket_name = bucket_name or os.getenv("REMOTE_STORAGE_S3_BUCKET") assert bucket_name is not None, "no remote storage bucket name provided" @@ -334,9 +369,6 @@ class RemoteStorageKind(str, enum.Enum): client = boto3.client( "s3", region_name=bucket_region, - aws_access_key_id=env_access_key, - aws_secret_access_key=env_secret_key, - aws_session_token=session_token, ) return S3Storage( @@ -344,6 +376,7 @@ class RemoteStorageKind(str, enum.Enum): bucket_region=bucket_region, access_key=env_access_key, secret_key=env_secret_key, + aws_profile=env_profile, prefix_in_bucket=prefix_in_bucket, client=client, cleanup=True, @@ -391,6 +424,13 @@ def default_remote_storage() -> RemoteStorageKind: return RemoteStorageKind.LOCAL_FS +def remote_storage_to_toml_dict(remote_storage: RemoteStorage) -> Dict[str, Any]: + if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): + raise Exception("invalid remote storage type") + + return remote_storage.to_toml_dict() + + # serialize as toml inline table def remote_storage_to_toml_inline_table(remote_storage: 
RemoteStorage) -> str: if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): diff --git a/test_runner/fixtures/safekeeper/__init__.py b/test_runner/fixtures/safekeeper/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py new file mode 100644 index 0000000000..11e6fef28f --- /dev/null +++ b/test_runner/fixtures/safekeeper/http.py @@ -0,0 +1,236 @@ +import json +import re +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Tuple, Union + +import pytest +import requests + +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log + + +# Walreceiver as returned by sk's timeline status endpoint. +@dataclass +class Walreceiver: + conn_id: int + state: str + + +@dataclass +class SafekeeperTimelineStatus: + term: int + last_log_term: int + pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + flush_lsn: Lsn + commit_lsn: Lsn + timeline_start_lsn: Lsn + backup_lsn: Lsn + peer_horizon_lsn: Lsn + remote_consistent_lsn: Lsn + walreceivers: List[Walreceiver] + + +@dataclass +class SafekeeperMetrics: + # These are metrics from Prometheus which uses float64 internally. + # As a consequence, values may differ from real original int64s. 
+ flush_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + commit_lsn_inexact: Dict[Tuple[TenantId, TimelineId], int] = field(default_factory=dict) + + +class SafekeeperHttpClient(requests.Session): + HTTPError = requests.HTTPError + + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): + super().__init__() + self.port = port + self.auth_token = auth_token + self.is_testing_enabled = is_testing_enabled + + if auth_token is not None: + self.headers["Authorization"] = f"Bearer {auth_token}" + + def check_status(self): + self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + + def is_testing_enabled_or_skip(self): + if not self.is_testing_enabled: + pytest.skip("safekeeper was built without 'testing' feature") + + def configure_failpoints(self, config_strings: Union[Tuple[str, str], List[Tuple[str, str]]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + res_json = res.json() + assert res_json is None + return res_json + + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: + params = params or {} + res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) + res.raise_for_status() + res_json = json.loads(res.text) + assert isinstance(res_json, dict) + return res_json + + def patch_control_file( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + patch: Dict[str, Any], + ) -> Dict[str, Any]: + res = self.patch( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file", + json={ + "updates": patch, + 
"apply_fields": list(patch.keys()), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: + res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_create( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: int, # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 + commit_lsn: Lsn, + ): + body = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "pg_version": pg_version, + "commit_lsn": str(commit_lsn), + } + res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body) + res.raise_for_status() + + def timeline_status( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> SafekeeperTimelineStatus: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}") + res.raise_for_status() + resj = res.json() + walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] + return SafekeeperTimelineStatus( + term=resj["acceptor_state"]["term"], + last_log_term=resj["acceptor_state"]["epoch"], + 
pg_version=resj["pg_info"]["pg_version"], + flush_lsn=Lsn(resj["flush_lsn"]), + commit_lsn=Lsn(resj["commit_lsn"]), + timeline_start_lsn=Lsn(resj["timeline_start_lsn"]), + backup_lsn=Lsn(resj["backup_lsn"]), + peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]), + remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]), + walreceivers=walreceivers, + ) + + def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + return self.timeline_status(tenant_id, timeline_id).commit_lsn + + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): + res = self.post( + f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", + json=body, + ) + res.raise_for_status() + + def checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", + json={}, + ) + res.raise_for_status() + + # only_local doesn't remove segments in the remote storage. 
+ def timeline_delete( + self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False + ) -> Dict[Any, Any]: + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + params={ + "only_local": str(only_local).lower(), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def tenant_delete_force(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def get_metrics_str(self) -> str: + request_result = self.get(f"http://localhost:{self.port}/metrics") + request_result.raise_for_status() + return request_result.text + + def get_metrics(self) -> SafekeeperMetrics: + all_metrics_text = self.get_metrics_str() + + metrics = SafekeeperMetrics() + for match in re.finditer( + r'^safekeeper_flush_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): + metrics.flush_lsn_inexact[(TenantId(match.group(1)), TimelineId(match.group(2)))] = int( + match.group(3) + ) + for match in re.finditer( + r'^safekeeper_commit_lsn{tenant_id="([0-9a-f]+)",timeline_id="([0-9a-f]+)"} (\S+)$', + all_metrics_text, + re.MULTILINE, + ): + metrics.commit_lsn_inexact[ + (TenantId(match.group(1)), TimelineId(match.group(2))) + ] = int(match.group(3)) + return metrics diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py new file mode 100644 index 0000000000..0e4b5d7883 --- /dev/null +++ b/test_runner/fixtures/safekeeper/utils.py @@ -0,0 +1,11 @@ +from fixtures.common_types import TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.safekeeper.http import SafekeeperHttpClient + + +def are_walreceivers_absent( + sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): + status 
= sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + return len(status.walreceivers) == 0 diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index cda788b2a4..0989dc1893 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,20 +1,27 @@ import contextlib +import enum import json import os import re import subprocess +import tarfile import threading import time +from hashlib import sha256 from pathlib import Path from typing import ( + IO, TYPE_CHECKING, Any, Callable, Dict, + Iterable, List, Optional, + Set, Tuple, TypeVar, + Union, ) from urllib.parse import urlencode @@ -23,14 +30,14 @@ import zstandard from psycopg2.extensions import cursor from fixtures.log_helper import log -from fixtures.pageserver.types import ( +from fixtures.pageserver.common_types import ( parse_delta_layer, parse_image_layer, ) if TYPE_CHECKING: from fixtures.neon_fixtures import PgBin -from fixtures.types import TimelineId +from fixtures.common_types import TimelineId Fn = TypeVar("Fn", bound=Callable[..., Any]) @@ -189,7 +196,7 @@ def query_scalar(cur: cursor, query: str) -> Any: # Traverse directory to get total size. -def get_dir_size(path: str) -> int: +def get_dir_size(path: Path) -> int: """Return size in bytes.""" totalbytes = 0 for root, _dirs, files in os.walk(path): @@ -369,7 +376,12 @@ def start_in_background( return spawned_process -def wait_until(number_of_iterations: int, interval: float, func: Fn): +WaitUntilRet = TypeVar("WaitUntilRet") + + +def wait_until( + number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet] +) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the last return value from the function. 
@@ -387,6 +399,18 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): raise Exception("timed out while waiting for %s" % func) from last_exception +def assert_eq(a, b) -> None: + assert a == b + + +def assert_gt(a, b) -> None: + assert a > b + + +def assert_ge(a, b) -> None: + assert a >= b + + def run_pg_bench_small(pg_bin: "PgBin", connstr: str): """ Fast way to populate data. @@ -397,3 +421,181 @@ def run_pg_bench_small(pg_bin: "PgBin", connstr: str): } """ pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr]) + + +def humantime_to_ms(humantime: str) -> float: + """ + Converts Rust humantime's output string to milliseconds. + + humantime_to_ms("1h 1ms 406us") -> 3600001.406 + """ + + unit_multiplier_map = { + "ns": 1e-6, + "us": 1e-3, + "ms": 1, + "s": 1e3, + "m": 1e3 * 60, + "h": 1e3 * 60 * 60, + } + matcher = re.compile(rf"^(\d+)({'|'.join(unit_multiplier_map.keys())})$") + total_ms = 0.0 + + if humantime == "0": + return total_ms + + for item in humantime.split(): + if (match := matcher.search(item)) is not None: + n, unit = match.groups() + total_ms += int(n) * unit_multiplier_map[unit] + else: + raise ValueError( + f"can't parse '{item}' (from string '{humantime}'), known units are {', '.join(unit_multiplier_map.keys())}." + ) + + return round(total_ms, 3) + + +def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]: + # FIXME: this duplicates test_runner/fixtures/pageserver/allowed_errors.py + error_or_warn = re.compile(r"\s(ERROR|WARN)") + errors = [] + for lineno, line in enumerate(input, start=1): + if len(line) == 0: + continue + + if error_or_warn.search(line): + # Is this a torn log line? 
This happens when force-killing a process and restarting + # Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192" + if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line): + continue + + # It's an ERROR or WARN. Is it in the allow-list? + for a in allowed_errors: + if re.match(a, line): + break + else: + errors.append((lineno, line)) + return errors + + +def assert_no_errors(log_file, service, allowed_errors): + if not log_file.exists(): + log.warning(f"Skipping {service} log check: {log_file} does not exist") + return + + with log_file.open("r") as f: + errors = scan_log_for_errors(f, allowed_errors) + + for _lineno, error in errors: + log.info(f"not allowed {service} error: {error.strip()}") + + assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" + + +@enum.unique +class AuxFileStore(str, enum.Enum): + V1 = "v1" + V2 = "v2" + CrossValidation = "cross-validation" + + def __repr__(self) -> str: + return f"'aux-{self.value}'" + + def __str__(self) -> str: + return f"'aux-{self.value}'" + + +def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: Set[str]): + """ + This is essentially: + + lines=$(comm -3 \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + | wc -l) + [ "$lines" = "0" ] + + But in a more mac friendly fashion. 
+ """ + started_at = time.time() + + def hash_extracted(reader: Union[IO[bytes], None]) -> bytes: + assert reader is not None + digest = sha256(usedforsecurity=False) + while True: + buf = reader.read(64 * 1024) + if not buf: + break + digest.update(buf) + return digest.digest() + + def build_hash_list(p: Path) -> List[Tuple[str, bytes]]: + with tarfile.open(p) as f: + matching_files = (info for info in f if info.isreg() and info.name not in skip_files) + ret = list( + map(lambda info: (info.name, hash_extracted(f.extractfile(info))), matching_files) + ) + ret.sort(key=lambda t: t[0]) + return ret + + left_list, right_list = map(build_hash_list, [left, right]) + + assert len(left_list) == len( + right_list + ), f"unexpected number of files on tar files, {len(left_list)} != {len(right_list)}" + + mismatching = set() + + for left_tuple, right_tuple in zip(left_list, right_list): + left_path, left_hash = left_tuple + right_path, right_hash = right_tuple + assert ( + left_path == right_path + ), f"file count matched, expected these to be same paths: {left_path}, {right_path}" + if left_hash != right_hash: + mismatching.add(left_path) + + assert len(mismatching) == 0, f"files with hash mismatch: {mismatching}" + + elapsed = time.time() - started_at + log.info(f"assert_pageserver_backups_equal completed in {elapsed}s") + + +class PropagatingThread(threading.Thread): + _target: Any + _args: Any + _kwargs: Any + """ + Simple Thread wrapper with join() propagating the possible exception in the thread. + """ + + def run(self): + self.exc = None + try: + self.ret = self._target(*self._args, **self._kwargs) + except BaseException as e: + self.exc = e + + def join(self, timeout=None): + super(PropagatingThread, self).join(timeout) + if self.exc: + raise self.exc + return self.ret + + +def human_bytes(amt: float) -> str: + """ + Render a bytes amount into nice IEC bytes string. 
+ """ + + suffixes = ["", "Ki", "Mi", "Gi"] + + last = suffixes[-1] + + for name in suffixes: + if amt < 1024 or name == last: + return f"{int(round(amt))} {name}B" + amt = amt / 1024 + + raise RuntimeError("unreachable") diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 241531437c..dfd9caba3e 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -1,14 +1,20 @@ -from typing import Optional +import threading +from typing import Any, Optional +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, NeonEnv, last_flush_lsn_upload, + tenant_get_shards, wait_for_last_flush_lsn, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.types import TenantId, TimelineId + +# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex +# to ensure we don't do that: this enables running lots of Workloads in parallel safely. 
+ENDPOINT_LOCK = threading.Lock() class Workload: @@ -20,28 +26,53 @@ class Workload: - reads, checking we get the right data (`validate`) """ - def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + def __init__( + self, + env: NeonEnv, + tenant_id: TenantId, + timeline_id: TimelineId, + branch_name: Optional[str] = None, + endpoint_opts: Optional[dict[str, Any]] = None, + ): self.env = env self.tenant_id = tenant_id self.timeline_id = timeline_id self.table = "foo" + # By default, use the default branch name for initial tenant in NeonEnv + self.branch_name = branch_name or "main" + self.expect_rows = 0 self.churn_cursor = 0 self._endpoint: Optional[Endpoint] = None + self._endpoint_opts = endpoint_opts or {} - def endpoint(self, pageserver_id: int) -> Endpoint: - if self._endpoint is None: - self._endpoint = self.env.endpoints.create( - "main", - tenant_id=self.tenant_id, - pageserver_id=pageserver_id, - endpoint_id="ep-workload", - ) - self._endpoint.start(pageserver_id=pageserver_id) - else: - self._endpoint.reconfigure(pageserver_id=pageserver_id) + def reconfigure(self): + """ + Request the endpoint to reconfigure based on location reported by storage controller + """ + if self._endpoint is not None: + with ENDPOINT_LOCK: + self._endpoint.reconfigure() + + def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint: + # We may be running alongside other Workloads for different tenants. Full TTID is + # obnoxiously long for use here, but a cut-down version is still unique enough for tests. 
+ endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}" + + with ENDPOINT_LOCK: + if self._endpoint is None: + self._endpoint = self.env.endpoints.create( + self.branch_name, + tenant_id=self.tenant_id, + pageserver_id=pageserver_id, + endpoint_id=endpoint_id, + **self._endpoint_opts, + ) + self._endpoint.start(pageserver_id=pageserver_id) + else: + self._endpoint.reconfigure(pageserver_id=pageserver_id) connstring = self._endpoint.safe_psql( "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" @@ -50,11 +81,15 @@ class Workload: return self._endpoint - def __del__(self): + def stop(self): if self._endpoint is not None: self._endpoint.stop() + self._endpoint = None - def init(self, pageserver_id: int): + def __del__(self): + self.stop() + + def init(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);") @@ -63,7 +98,7 @@ class Workload: self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id ) - def write_rows(self, n, pageserver_id): + def write_rows(self, n, pageserver_id: Optional[int] = None, upload: bool = True): endpoint = self.endpoint(pageserver_id) start = self.expect_rows end = start + n - 1 @@ -77,11 +112,14 @@ class Workload: """ ) - return last_flush_lsn_upload( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) + if upload: + return last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + else: + return False - def churn_rows(self, n, pageserver_id, upload=True): + def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True): assert self.expect_rows >= n max_iters = 10 @@ -119,21 +157,30 @@ class Workload: ] ) - last_flush_lsn = wait_for_last_flush_lsn( - self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id - ) - 
ps_http = self.env.get_pageserver(pageserver_id).http_client() - wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn) + if ingest: + # Wait for written data to be ingested by the pageserver + for tenant_shard_id, pageserver in tenant_get_shards( + self.env, self.tenant_id, pageserver_id + ): + last_flush_lsn = wait_for_last_flush_lsn( + self.env, + endpoint, + self.tenant_id, + self.timeline_id, + pageserver_id=pageserver_id, + ) + ps_http = pageserver.http_client() + wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) - if upload: - # force a checkpoint to trigger upload - ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id) - wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn) - log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") - else: - log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") + if upload: + # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) + ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id) + wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn) + log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") + else: + log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") - def validate(self, pageserver_id): + def validate(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) result = endpoint.safe_psql_many( [ diff --git a/test_runner/performance/pageserver/README.md b/test_runner/performance/pageserver/README.md new file mode 100644 index 0000000000..fdd09cd946 --- /dev/null +++ b/test_runner/performance/pageserver/README.md @@ -0,0 +1,16 @@ +How to reproduce benchmark results / run these benchmarks interactively. + +1. Get an EC2 instance with Instance Store. Use the same instance type as used for the benchmark run. +2. Mount the Instance Store => `neon.git/scripts/ps_ec2_setup_instance_store` +3. 
Use a pytest command line (see other READMEs further up in the pytest hierarchy). + +For tests that take a long time to set up / consume a lot of storage space, +we use the test suite's repo_dir snapshotting functionality (`from_repo_dir`). +It supports mounting snapshots using overlayfs, which improves iteration time. + +Here's a full command line. + +``` +RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \ + ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +```` diff --git a/test_runner/performance/pageserver/__init__.py b/test_runner/performance/pageserver/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/performance/pageserver/interactive/__init__.py b/test_runner/performance/pageserver/interactive/__init__.py new file mode 100644 index 0000000000..29644c240e --- /dev/null +++ b/test_runner/performance/pageserver/interactive/__init__.py @@ -0,0 +1,8 @@ +""" +Tests that aren't really tests or benchmarks. + +They're intended for the case where we want to standardize & automate setup, +but then debug a performance problem interactively. +It's kind of an abuse of the test framework, but, it's our only tool right +now to automate a complex test bench setup. 
+""" diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py new file mode 100644 index 0000000000..0ff9c8fdaa --- /dev/null +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -0,0 +1,79 @@ +import os +import pdb + +import fixtures.pageserver.many_tenants as many_tenants +import pytest +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + last_flush_lsn_upload, +) + +from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking + +""" +Usage: +DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \ + ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py +""" + + +@pytest.mark.skipif( + os.environ.get("INTERACTIVE", "false") != "true", + reason="test is for interactive use only", +) +def test_many_small_tenants( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + _env = setup_env(neon_env_builder, 2) # vary this to the desired number of tenants + _pg_bin = pg_bin + + # drop into pdb so that we can debug pageserver interactively, use pdb here + # For example, to interactively examine pageserver startup behavior, call + # _env.pageserver.stop(immediate=True) + # _env.pageserver.start() + # from the pdb shell. 
+ pdb.set_trace() + + +def setup_env( + neon_env_builder: NeonEnvBuilder, + n_tenants: int, +) -> NeonEnv: + def setup_template(env: NeonEnv): + # create our template tenant + config = { + "gc_period": "0s", + "checkpoint_timeout": "10 years", + "compaction_period": "20 s", + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + ep = env.endpoints.create_start("main", tenant_id=template_tenant) + ep.safe_psql("create table foo(b text)") + for _ in range(0, 8): + ep.safe_psql("insert into foo(b) values ('some text')") + last_flush_lsn_upload(env, ep, template_tenant, template_timeline) + ep.stop_and_destroy() + return (template_tenant, template_timeline, config) + + def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants) + + env = neon_env_builder.build_and_use_snapshot(f"many-small-tenants-{n_tenants}", doit) + + env.start() + ensure_pageserver_ready_for_benchmarking(env, n_tenants) + + return env diff --git a/test_runner/performance/pageserver/pagebench/__init__.py b/test_runner/performance/pageserver/pagebench/__init__.py new file mode 100644 index 0000000000..9f5e45c0a0 --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/__init__.py @@ -0,0 +1,10 @@ +""" +Pagebench-based performance regression tests. 
+ +The defining characteristic of tests in this sub-directory is that they +are component-level tests, i.e., they exercise pageserver directly using `pagebench` +instead of benchmarking the full stack. + +See https://github.com/neondatabase/neon/issues/5771 +for the context in which this was developed. +""" diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py new file mode 100644 index 0000000000..b66db4d0ab --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -0,0 +1,195 @@ +import asyncio +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.utils import get_scale_for_db, humantime_to_ms + +from performance.pageserver.util import ( + setup_pageserver_with_tenants, +) + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [10]) +@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) +@pytest.mark.timeout(1000) +def test_basebackup_with_high_slru_count( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + get_vectored_impl: str, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_basebackup.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + "n_tenants": (n_tenants, {"unit": ""}), + "pgbench_scale": (pgbench_scale, {"unit": ""}), + "duration": (duration, {"unit": "s"}), + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 
+ max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " + f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + n_txns = 500000 + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, n_txns) + + env = setup_pageserver_with_tenants( + neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + ) + run_benchmark(env, pg_bin, record, duration) + + +def setup_tenant_template(env: NeonEnv, n_txns: int): + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + + ps_http = env.pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=template_tenant, config_lines=["shared_buffers=1MB"] + ) as ep: + rels = 10 + + asyncio.run(run_updates(ep, n_txns, rels)) + + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + 
ps_http.timeline_compact(template_tenant, template_timeline) + + return (template_tenant, template_timeline, config) + + +# Takes about 5 minutes and produces tenants with around 300 SLRU blocks +# of 8 KiB each. +async def run_updates(ep: Endpoint, n_txns: int, workers_count: int): + workers = [] + for i in range(workers_count): + workers.append(asyncio.create_task(run_update_loop_worker(ep, n_txns, i))) + + await asyncio.gather(*workers) + + +async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int): + table = f"t_{idx}" + conn = await ep.connect_async() + await conn.execute(f"CREATE TABLE {table} (pk integer PRIMARY KEY, x integer)") + await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)") + await conn.execute(f"INSERT INTO {table} VALUES (1, 0)") + await conn.execute( + f""" + CREATE PROCEDURE updating{table}() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..{n_txns} LOOP + UPDATE {table} SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """ + ) + await conn.execute("SET statement_timeout=0") + await conn.execute(f"call updating{table}()") + + +def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "basebackup", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--gzip-probability", + "1", + "--runtime", + f"{duration_secs}s", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + total = results["total"] + metric = "request_count" + record( + metric, + 
metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "latency_mean" + record( + metric, + metric_value=humantime_to_ms(total[metric]), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "latency_percentiles" + for k, v in total[metric].items(): + record( + f"{metric}.{k}", + metric_value=humantime_to_ms(v), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py new file mode 100644 index 0000000000..644c1f559b --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py @@ -0,0 +1,175 @@ +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.utils import wait_for_upload_queue_empty +from fixtures.remote_storage import s3_storage +from fixtures.utils import humantime_to_ms + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("io_engine", ["tokio-epoll-uring", "std-fs"]) +@pytest.mark.parametrize("concurrency_per_target", [1, 10, 100]) +@pytest.mark.timeout(1000) +def test_download_churn( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + io_engine: str, + concurrency_per_target: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_ondemand_download_churn.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + # we don't capture `duration`, but instead use the `runtime` output field from pagebench + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 
+ max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + # Setup env + env = setup_env(neon_env_builder, pg_bin) + env.pageserver.allowed_errors.append( + f".*path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" + ) + + run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration) + + +def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # We configure tenant conf such that SQL query below produces a lot of layers. + # We don't care what's in the layers really, we just care that layers are created. 
+ bytes_per_layer = 10 * (1024**2) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "pitr_interval": "1000d", # let's not make it get in the way + "gc_period": "0s", # disable periodic gc to avoid noise + "compaction_period": "0s", # disable L0=>L1 compaction + "checkpoint_timeout": "10years", # rely solely on checkpoint_distance + "checkpoint_distance": bytes_per_layer, # 10M instead of 256M to create more smaller layers + "image_creation_threshold": 100000, # don't create image layers ever + } + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + ep.safe_psql("CREATE TABLE data (random_text text)") + bytes_per_row = 512 # make big enough so WAL record size doesn't dominate + desired_layers = 300 + desired_bytes = bytes_per_layer * desired_layers + nrows = desired_bytes / bytes_per_row + ep.safe_psql( + f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)}) as i", + options="-c statement_timeout=0", + ) + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here + wait_for_upload_queue_empty(client, tenant_id, timeline_id) + + return env + + +def run_benchmark( + env: NeonEnv, + pg_bin: PgBin, + record, + io_engine: str, + concurrency_per_target: int, + duration_secs: int, +): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "ondemand-download-churn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--runtime", + f"{duration_secs}s", + "--set-io-engine", + f"{io_engine}", + "--concurrency-per-target", + f"{concurrency_per_target}", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + 
results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + metric = "downloads_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "downloads_bytes" + record( + metric, + metric_value=results[metric], + unit="byte", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "evictions_count" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "timeline_restarts" + record( + metric, + metric_value=results[metric], + unit="", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "runtime" + record( + metric, + metric_value=humantime_to_ms(results[metric]) / 1000, + unit="s", + report=MetricReport.TEST_PARAM, + ) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py new file mode 100644 index 0000000000..68f3d9dcbe --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -0,0 +1,219 @@ +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + wait_for_last_flush_lsn, +) +from fixtures.utils import get_scale_for_db, humantime_to_ms + +from performance.pageserver.util import ( + setup_pageserver_with_tenants, +) + + +# For reference, the space usage of the snapshots: +# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots +# 137G /instance_store/test_output/shared-snapshots 
+# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/* +# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13 +# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6 +# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13 +# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6 +# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13 +# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6 +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]]) +@pytest.mark.parametrize("n_tenants", [1, 10]) +@pytest.mark.timeout( + 10000 +) # TODO: this value is just "a really high number"; have this per instance type +def test_pageserver_max_throughput_getpage_at_latest_lsn( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record( + metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs + ) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + "n_tenants": (n_tenants, {"unit": ""}), + "pgbench_scale": (pgbench_scale, {"unit": ""}), + "duration": (duration, {"unit": "s"}), + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + 
record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, pg_bin, pgbench_scale) + + env = setup_pageserver_with_tenants( + neon_env_builder, + f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}", + n_tenants, + setup_wrapper, + ) + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + # https://github.com/neondatabase/neon/issues/6390 + # https://github.com/neondatabase/neon/issues/6724 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + + run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) + + +def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): + """ + Set up a template tenant which will be replicated by the test infra. + It's a pgbench tenant, initialized to a certain scale, and treated afterwards + with a repeat application of (pgbench simple-update workload, checkpoint, compact). + """ + # use a config that makes production of on-disk state timing-insensitive + # as we ingest data into the tenant. 
+ config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + ps_http = env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + for _ in range( + 0, 17 + ): # some prime number to avoid potential resonances with the "_threshold" variables from the config + # the L0s produced by this appear to have size ~5MiB + num_txns = 10_000 + pg_bin.run_capture( + ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] + ) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + # for reference, the output at scale=6 looked like so (306M total) + # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 + # total 306M + # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 + # 4.5M 
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 + # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 + # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 + # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 + # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 + # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 + # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 + # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 + # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 + + return (template_tenant, template_timeline, config) + + +def run_benchmark_max_throughput_latest_lsn( + env: NeonEnv, pg_bin: PgBin, record, duration_secs: int +): + """ + Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`. 
+ """ + + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "get-page-latest-lsn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--runtime", + f"{duration_secs}s", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + total = results["total"] + + metric = "request_count" + record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "latency_mean" + record( + metric, + metric_value=humantime_to_ms(total[metric]), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "latency_percentiles" + for k, v in total[metric].items(): + record( + f"{metric}.{k}", + metric_value=humantime_to_ms(v), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + env.storage_controller.allowed_errors.append( + # The test setup swaps NeonEnv instances, hence different + # pg instances are used for the storage controller db. This means + # the storage controller doesn't know about the nodes mentioned + # in attachments.json at start-up. 
+ ".* Scheduler missing node 1", + ) diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py new file mode 100644 index 0000000000..f31cd9a9f8 --- /dev/null +++ b/test_runner/performance/pageserver/util.py @@ -0,0 +1,55 @@ +""" +Utilities used by all code in this sub-directory +""" + +from typing import Any, Callable, Dict, Tuple + +import fixtures.pageserver.many_tenants as many_tenants +from fixtures.common_types import TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, +) +from fixtures.pageserver.utils import wait_until_all_tenants_state + + +def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): + """ + Helper function. + """ + ps_http = env.pageserver.http_client() + + log.info("wait for all tenants to become active") + wait_until_all_tenants_state( + ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False + ) + + # ensure all layers are resident for predictable performance + tenants = [info["id"] for info in ps_http.tenant_list()] + for tenant in tenants: + for timeline in ps_http.tenant_status(tenant)["timelines"]: + info = ps_http.layer_map_info(tenant, timeline) + for layer in info.historic_layers: + assert not layer.remote + + log.info("ready") + + +def setup_pageserver_with_tenants( + neon_env_builder: NeonEnvBuilder, + name: str, + n_tenants: int, + setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], +) -> NeonEnv: + """ + Utility function to set up a pageserver with a given number of identical tenants. 
+ """ + + def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + return many_tenants.single_timeline(neon_env_builder, setup, n_tenants) + + env = neon_env_builder.build_and_use_snapshot(name, doit) + env.start() + ensure_pageserver_ready_for_benchmarking(env, n_tenants) + return env diff --git a/test_runner/performance/pgvector/HNSW_build.sql b/test_runner/performance/pgvector/HNSW_build.sql new file mode 100644 index 0000000000..9e6918b755 --- /dev/null +++ b/test_runner/performance/pgvector/HNSW_build.sql @@ -0,0 +1,47 @@ + +\set ECHO queries +\timing + +-- prepare test table +DROP TABLE IF EXISTS hnsw_test_table; +CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA; +INSERT INTO hnsw_test_table SELECT * FROM documents; +CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries +-- tune index build params +SET max_parallel_maintenance_workers = 7; +SET maintenance_work_mem = '8GB'; +-- create HNSW index for the supported distance metrics +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops); +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops); +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops); +CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops); +CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops); +-- note: in a second psql session we can monitor the progress of the index build phases using +-- the following query: +-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index; + +-- show all indexes built on the table +SELECT + idx.relname AS index_name, + tbl.relname AS table_name, + am.amname AS access_method, + a.attname AS column_name, + opc.opcname AS operator_class +FROM + pg_index i +JOIN + pg_class idx ON idx.oid = i.indexrelid +JOIN + pg_class tbl ON tbl.oid = i.indrelid +JOIN + pg_am am ON am.oid = 
idx.relam +JOIN + pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey) +JOIN + pg_opclass opc ON opc.oid = i.indclass[0] +WHERE + tbl.relname = 'hnsw_test_table' + AND a.attname = 'embeddings'; + +-- show table sizes +\dt+ diff --git a/test_runner/performance/pgvector/IVFFLAT_build.sql b/test_runner/performance/pgvector/IVFFLAT_build.sql new file mode 100644 index 0000000000..338980831a --- /dev/null +++ b/test_runner/performance/pgvector/IVFFLAT_build.sql @@ -0,0 +1,52 @@ + +\set ECHO queries +\timing + +-- prepare test table +DROP TABLE IF EXISTS ivfflat_test_table; +CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA; +INSERT INTO ivfflat_test_table SELECT * FROM documents; +CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries +-- tune index build params +SET max_parallel_maintenance_workers = 7; +SET maintenance_work_mem = '8GB'; +-- create ivfflat index for the supported distance metrics +-- the formula for lists is # rows / 1000 or sqrt(# rows) if # rows > 1 million +-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents +-- so we use 1000 lists +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings::halfvec(1536) halfvec_l2_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table + USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000); + +\d ivfflat_test_table + + +-- show all indexes built on the table +SELECT + idx.relname AS index_name, + tbl.relname AS table_name, + am.amname AS access_method, + a.attname AS column_name, + opc.opcname AS operator_class +FROM + pg_index i +JOIN + pg_class idx ON idx.oid = i.indexrelid 
+JOIN + pg_class tbl ON tbl.oid = i.indrelid +JOIN + pg_am am ON am.oid = idx.relam +JOIN + pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey) +JOIN + pg_opclass opc ON opc.oid = i.indclass[0] +WHERE + tbl.relname = 'ivfflat_test_table' + AND a.attname = 'embeddings'; +-- show table sizes +\dt+ + + diff --git a/test_runner/performance/pgvector/README.md b/test_runner/performance/pgvector/README.md new file mode 100644 index 0000000000..83495d270a --- /dev/null +++ b/test_runner/performance/pgvector/README.md @@ -0,0 +1,55 @@ +# Source of the dataset for pgvector tests + +This readme was copied from https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M + +## Download the parquet files + +```bash +brew install git-lfs +git-lfs clone https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M +``` + +## Load into postgres: + +see loaddata.py in this directory + +## Rest of dataset card as on huggingface + +--- +dataset_info: + features: + - name: _id + dtype: string + - name: title + dtype: string + - name: text + dtype: string + - name: text-embedding-3-large-1536-embedding + sequence: float64 + splits: + - name: train + num_bytes: 12679725776 + num_examples: 1000000 + download_size: 9551862565 + dataset_size: 12679725776 +configs: +- config_name: default + data_files: + - split: train + path: data/train-* +license: mit +task_categories: +- feature-extraction +language: +- en +size_categories: +- 1M ") + + +def main(conn_str, directory_path): + # Connection to PostgreSQL + with psycopg2.connect(conn_str) as conn: + with conn.cursor() as cursor: + # Run SQL statements + cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;") + register_vector(conn) + cursor.execute("DROP TABLE IF EXISTS documents;") + cursor.execute( + """ + CREATE TABLE documents ( + _id TEXT PRIMARY KEY, + title TEXT, + text TEXT, + embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI) + 
); + """ + ) + conn.commit() + + # List and sort Parquet files + parquet_files = sorted(Path(directory_path).glob("*.parquet")) + + for file in parquet_files: + print(f"Loading {file} into PostgreSQL") + df = pd.read_parquet(file) + + print(df.head()) + + data_list = [ + ( + row["_id"], + row["title"], + row["text"], + np.array(row["text-embedding-3-large-1536-embedding"]), + ) + for index, row in df.iterrows() + ] + # Use execute_values to perform batch insertion + execute_values( + cursor, + "INSERT INTO documents (_id, title, text, embeddings) VALUES %s", + data_list, + ) + # Commit after we insert all embeddings + conn.commit() + + print(f"Loaded {file} into PostgreSQL") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print_usage() + sys.exit(1) + + conn_str = sys.argv[1] + directory_path = sys.argv[2] + main(conn_str, directory_path) diff --git a/test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql new file mode 100644 index 0000000000..70d0c18149 --- /dev/null +++ b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql @@ -0,0 +1,13 @@ +-- run with pooled connection +-- pgbench -T 300 -c 100 -j20 -f pgbench_halfvec_queries.sql -postgresql://neondb_owner:@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require" + +with x (x) as ( + select "embeddings" as x + from halfvec_test_table + TABLESAMPLE SYSTEM (1) + LIMIT 1 +) +SELECT title, "embeddings" <=> (select x from x) as distance +FROM halfvec_test_table +ORDER BY 2 +LIMIT 30; \ No newline at end of file diff --git a/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql new file mode 100644 index 0000000000..886ae9645b --- /dev/null +++ b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql @@ 
-0,0 +1,10 @@ +with x (x) as ( + select "embeddings" as x + from hnsw_test_table + TABLESAMPLE SYSTEM (1) + LIMIT 1 +) +SELECT title, "embeddings" <=> (select x from x) as distance +FROM hnsw_test_table +ORDER BY 2 +LIMIT 30; diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 6edcb8f1f2..b3866f1813 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -1,4 +1,5 @@ import random +import re import statistics import threading import time @@ -7,11 +8,14 @@ from contextlib import closing from typing import List import pytest -from fixtures.benchmark_fixture import MetricReport +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonPageserver from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn +from fixtures.utils import wait_until +from prometheus_client.samples import Sample def _record_branch_creation_durations(neon_compare: NeonCompare, durs: List[float]): @@ -74,7 +78,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) p = random.randint(0, i) timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{p}", tenant_id=tenant) dur = timeit.default_timer() - timer log.info(f"Creating branch b{i+1} took {dur}s") @@ -89,11 +93,17 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) _record_branch_creation_durations(neon_compare, branch_creation_durations) -@pytest.mark.parametrize("n_branches", [1024]) -# Test measures the latency of branch creation when creating a lot of branches. 
-def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): +@pytest.mark.parametrize("n_branches", [500, 1024]) +@pytest.mark.parametrize("shape", ["one_ancestor", "random"]) +def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: str): + """ + Test measures the latency of branch creation when creating a lot of branches. + """ env = neon_compare.env + # seed the prng so we will measure the same structure every time + rng = random.Random("2024-02-29") + env.neon_cli.create_branch("b0") endpoint = env.endpoints.create_start("b0") @@ -102,15 +112,105 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int): branch_creation_durations = [] for i in range(n_branches): - # random a source branch - p = random.randint(0, i) + if shape == "random": + parent = f"b{rng.randint(0, i)}" + elif shape == "one_ancestor": + parent = "b0" + else: + raise RuntimeError(f"unimplemented shape: {shape}") + timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p)) + # each of these uploads to remote storage before completion + env.neon_cli.create_branch(f"b{i + 1}", parent) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) _record_branch_creation_durations(neon_compare, branch_creation_durations) + endpoint.stop_and_destroy() + + with neon_compare.record_duration("shutdown"): + # this sleeps 100ms between polls + env.pageserver.stop() + + startup_line = "INFO version: git(-env)?:" + + # find the first line of the log file so we can find the next start later + _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + + # start without gc so we can time compaction with less noise; use shorter + # period for compaction so it starts earlier + def patch_default_tenant_config(config): + tenant_config = config.get("tenant_config", {}) + tenant_config["compaction_period"] = "3s" + tenant_config["gc_period"] = "0s" + 
config["tenant_config"] = tenant_config + + env.pageserver.edit_config_toml(patch_default_tenant_config) + env.pageserver.start( + # this does print more than we want, but the number should be comparable between runs + extra_env_vars={ + "RUST_LOG": f"[compaction_loop{{tenant_id={env.initial_tenant}}}]=debug,info" + }, + ) + + _, second_start = wait_until( + 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + ) + env.pageserver.quiesce_tenants() + + wait_and_record_startup_metrics(env.pageserver, neon_compare.zenbenchmark, "restart_after") + + # wait for compaction to complete, which most likely has already done so multiple times + msg, _ = wait_until( + 30, + 1, + lambda: env.pageserver.assert_log_contains( + f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start + ), + ) + needle = re.search(" elapsed_ms=([0-9]+)", msg) + assert needle is not None, "failed to find the elapsed time" + duration = int(needle.group(1)) / 1000.0 + neon_compare.zenbenchmark.record("compaction", duration, "s", MetricReport.LOWER_IS_BETTER) + + +def wait_and_record_startup_metrics( + pageserver: NeonPageserver, target: NeonBenchmarker, prefix: str +): + """ + Waits until all startup metrics have non-zero values on the pageserver, then records them on the target + """ + + client = pageserver.http_client() + + expected_labels = set( + [ + "background_jobs_can_start", + "complete", + "initial", + "initial_tenant_load", + "initial_tenant_load_remote", + ] + ) + + def metrics_are_filled() -> List[Sample]: + m = client.get_metrics() + samples = m.query_all("pageserver_startup_duration_seconds") + # we should not have duplicate labels + matching = [ + x for x in samples if x.labels.get("phase") in expected_labels and x.value > 0.0 + ] + assert len(matching) == len(expected_labels) + return matching + + samples = wait_until(10, 1, metrics_are_filled) + + for sample in samples: + phase = sample.labels["phase"] + name = f"{prefix}.{phase}" + 
target.record(name, sample.value, "s", MetricReport.LOWER_IS_BETTER) + # Test measures the branch creation time when branching from a timeline with a lot of relations. # diff --git a/test_runner/performance/test_bulk_insert.py b/test_runner/performance/test_bulk_insert.py index edc23b29ba..3f56da7c1d 100644 --- a/test_runner/performance/test_bulk_insert.py +++ b/test_runner/performance/test_bulk_insert.py @@ -1,10 +1,11 @@ from contextlib import closing +import pytest from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare from fixtures.pageserver.utils import wait_tenant_status_404 from fixtures.pg_version import PgVersion -from fixtures.types import Lsn # @@ -17,6 +18,7 @@ from fixtures.types import Lsn # 3. Disk space used # 4. Peak memory usage # +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/7124") def test_bulk_insert(neon_with_baseline: PgCompare): env = neon_with_baseline @@ -56,12 +58,12 @@ def measure_recovery_time(env: NeonCompare): # Delete the Tenant in the pageserver: this will drop local and remote layers, such that # when we "create" the Tenant again, we will replay the WAL from the beginning. # - # This is a "weird" thing to do, and can confuse the attachment service as we're re-using + # This is a "weird" thing to do, and can confuse the storage controller as we're re-using # the same tenant ID for a tenant that is logically different from the pageserver's point # of view, but the same as far as the safekeeper/WAL is concerned. To work around that, # we will explicitly create the tenant in the same generation that it was previously # attached in. 
- attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant) + attach_status = env.env.storage_controller.inspect(tenant_shard_id=env.tenant) assert attach_status is not None (attach_gen, _) = attach_status diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index cf9e4808fc..9a03994b29 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -1,3 +1,5 @@ +import json + import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log @@ -13,6 +15,11 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma Information about image layers needed to collect old layers should be propagated by GC to compaction task which should take in in account when make a decision which new image layers needs to be created. + + NB: this test demonstrates the problem. The source tree contained the + `gc_feedback` mechanism for about 9 months, but, there were problems + with it and it wasn't enabled at runtime. 
+ This PR removed the code: https://github.com/neondatabase/neon/pull/6863 """ env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -68,9 +75,31 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"] log.info(f"Physical storage size {physical_size}") + max_num_of_deltas_above_image = 0 + max_total_num_of_deltas = 0 + for key_range in client.perf_info(tenant_id, timeline_id): + max_total_num_of_deltas = max(max_total_num_of_deltas, key_range["total_num_of_deltas"]) + max_num_of_deltas_above_image = max( + max_num_of_deltas_above_image, key_range["num_of_deltas_above_image"] + ) + MB = 1024 * 1024 zenbenchmark.record("logical_size", logical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER) zenbenchmark.record("physical_size", physical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER) zenbenchmark.record( "physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER ) + zenbenchmark.record( + "max_total_num_of_deltas", max_total_num_of_deltas, "", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "max_num_of_deltas_above_image", + max_num_of_deltas_above_image, + "", + MetricReport.LOWER_IS_BETTER, + ) + + layer_map_path = env.repo_dir / "layer-map.json" + log.info(f"Writing layer map to {layer_map_path}") + with layer_map_path.open("w") as f: + f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id))) diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index 6bd0d85fa2..9b20954d45 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -17,10 +17,10 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): tenant, _ = env.neon_cli.create_tenant( conf={ "gc_period": "0s", - "checkpoint_distance": "8192", + "checkpoint_distance": "16384", "compaction_period": "1 s", 
"compaction_threshold": "1", - "compaction_target_size": "8192", + "compaction_target_size": "16384", } ) diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py new file mode 100644 index 0000000000..e929bd4d05 --- /dev/null +++ b/test_runner/performance/test_lazy_startup.py @@ -0,0 +1,106 @@ +import pytest +import requests +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.neon_fixtures import NeonEnvBuilder + + +# Start and measure duration with huge SLRU segments. +# This test is similar to test_startup_simple, but it creates huge number of transactions +# and records containing this XIDs. Autovacuum is disable for the table to prevent CLOG truncation. +# +# This test runs pretty quickly and can be informative when used in combination +# with emulated network delay. Some useful delay commands: +# +# 1. Add 2msec delay to all localhost traffic +# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec` +# +# 2. Test that it works (you should see 4ms ping) +# `ping localhost` +# +# 3. Revert back to normal +# `sudo tc qdisc del dev lo root netem` +# +# NOTE this test might not represent the real startup time because the basebackup +# for a large database might be larger if there's a lof of transaction metadata, +# or safekeepers might need more syncing, or there might be more operations to +# apply during config step, like more users, databases, or extensions. By default +# we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this +# test we only load neon. 
+@pytest.mark.timeout(1800) +@pytest.mark.parametrize("slru", ["lazy", "eager"]) +def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + lazy_slru_download = "true" if slru == "lazy" else "false" + tenant, _ = env.neon_cli.create_tenant( + conf={ + "lazy_slru_download": lazy_slru_download, + } + ) + + endpoint = env.endpoints.create_start("main", tenant_id=tenant) + with endpoint.cursor() as cur: + cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)") + cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)") + cur.execute("INSERT INTO t VALUES (1, 0)") + cur.execute( + """ + CREATE PROCEDURE updating() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..1000000 LOOP + UPDATE t SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """ + ) + cur.execute("SET statement_timeout=0") + cur.execute("call updating()") + + endpoint.stop() + + # We do two iterations so we can see if the second startup is faster. It should + # be because the compute node should already be configured with roles, databases, + # extensions, etc from the first run. 
+ for i in range(2): + # Start + with zenbenchmark.record_duration(f"{slru}_{i}_start"): + endpoint.start() + + with zenbenchmark.record_duration(f"{slru}_{i}_select"): + sum = endpoint.safe_psql("select sum(x) from t")[0][0] + assert sum == 1000000 + + # Get metrics + metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json() + durations = { + "wait_for_spec_ms": f"{slru}_{i}_wait_for_spec", + "sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers", + "sync_sk_check_ms": f"{slru}_{i}_sync_sk_check", + "basebackup_ms": f"{slru}_{i}_basebackup", + "start_postgres_ms": f"{slru}_{i}_start_postgres", + "config_ms": f"{slru}_{i}_config", + "total_startup_ms": f"{slru}_{i}_total_startup", + } + for key, name in durations.items(): + value = metrics[key] + zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER) + + basebackup_bytes = metrics["basebackup_bytes"] + zenbenchmark.record( + f"{slru}_{i}_basebackup_bytes", + basebackup_bytes, + "bytes", + report=MetricReport.LOWER_IS_BETTER, + ) + + # Stop so we can restart + endpoint.stop() + + # Imitate optimizations that console would do for the second start + endpoint.respec(skip_pg_catalog_updates=True) diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 8a9509ea44..aaa2f8fec2 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -100,6 +100,30 @@ QUERIES: Tuple[LabelledQuery, ...] = ( ) # fmt: on +# A list of pgvector HNSW index builds to run. +# Please do not alter the label for the query, as it is used to identify it. +# +# Disable auto formatting for the list of queries so that it's easier to read +# fmt: off +PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] 
= ( + LabelledQuery("PGVPREP", r"ALTER EXTENSION VECTOR UPDATE;"), + LabelledQuery("PGV0", r"DROP TABLE IF EXISTS hnsw_test_table;"), + LabelledQuery("PGV1", r"CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;"), + LabelledQuery("PGV2", r"INSERT INTO hnsw_test_table SELECT * FROM documents;"), + LabelledQuery("PGV3", r"CREATE INDEX ON hnsw_test_table (_id);"), + LabelledQuery("PGV4", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);"), + LabelledQuery("PGV5", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);"), + LabelledQuery("PGV6", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);"), + LabelledQuery("PGV7", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);"), + LabelledQuery("PGV8", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);"), + LabelledQuery("PGV9", r"DROP TABLE IF EXISTS halfvec_test_table;"), + LabelledQuery("PGV10", r"CREATE TABLE halfvec_test_table (_id text NOT NULL, title text, text text, embeddings halfvec(1536), PRIMARY KEY (_id));"), + LabelledQuery("PGV11", r"INSERT INTO halfvec_test_table (_id, title, text, embeddings) SELECT _id, title, text, embeddings::halfvec FROM documents;"), + LabelledQuery("PGV12", r"CREATE INDEX documents_half_precision_hnsw_idx ON halfvec_test_table USING hnsw (embeddings halfvec_cosine_ops) WITH (m = 64, ef_construction = 128);"), +) +# fmt: on + + EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" @@ -245,3 +269,18 @@ def test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare): log.info("Collecting pg_stat_statements") query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;") run_psql(remote_compare, query, times=1, explain=False) + + +@pytest.mark.parametrize("query", PGVECTOR_QUERIES) +@pytest.mark.remote_cluster +def 
test_pgvector_indexing(query: LabelledQuery, remote_compare: RemoteCompare): + """ + An pgvector test that tests HNSW index build performance and parallelism. + + The DB prepared manually in advance. + See + - test_runner/performance/pgvector/README.md + - test_runner/performance/pgvector/loaddata.py + - test_runner/performance/pgvector/HNSW_build.sql + """ + run_psql(remote_compare, query, times=1, explain=False) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 2b8760dff2..6eaa29e4f8 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -17,6 +17,8 @@ class PgBenchLoadType(enum.Enum): INIT = "init" SIMPLE_UPDATE = "simple-update" SELECT_ONLY = "select-only" + PGVECTOR_HNSW = "pgvector-hnsw" + PGVECTOR_HALFVEC = "pgvector-halfvec" def utc_now_timestamp() -> int: @@ -132,6 +134,46 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P password=password, ) + if workload_type == PgBenchLoadType.PGVECTOR_HNSW: + # Run simple-update workload + run_pgbench( + env, + "pgvector-hnsw", + [ + "pgbench", + "-f", + "test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql", + "-c100", + "-j20", + f"-T{duration}", + "-P2", + "--protocol=prepared", + "--progress-timestamp", + connstr, + ], + password=password, + ) + + if workload_type == PgBenchLoadType.PGVECTOR_HALFVEC: + # Run simple-update workload + run_pgbench( + env, + "pgvector-halfvec", + [ + "pgbench", + "-f", + "test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql", + "-c100", + "-j20", + f"-T{duration}", + "-P2", + "--protocol=prepared", + "--progress-timestamp", + connstr, + ], + password=password, + ) + env.report_size() diff --git a/test_runner/performance/test_perf_pgvector_queries.py b/test_runner/performance/test_perf_pgvector_queries.py new file mode 100644 index 0000000000..bb3db16305 --- /dev/null +++ 
b/test_runner/performance/test_perf_pgvector_queries.py @@ -0,0 +1,24 @@ +import pytest +from fixtures.compare_fixtures import PgCompare + +from performance.test_perf_pgbench import PgBenchLoadType, get_durations_matrix, run_test_pgbench + + +# The following test runs on an existing database that has pgvector extension installed +# and a table with 1 million embedding vectors loaded and indexed with HNSW. +# +# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup. +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_pgvector_hnsw(remote_compare: PgCompare, duration: int): + run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW) + + +# The following test runs on an existing database that has pgvector extension installed +# and a table with 1 million embedding vectors loaded and indexed with halfvec. +# +# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup. 
+@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_pgvector_halfvec(remote_compare: PgCompare, duration: int): + run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HALFVEC) diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py new file mode 100644 index 0000000000..9cd83f0959 --- /dev/null +++ b/test_runner/performance/test_sharding_autosplit.py @@ -0,0 +1,280 @@ +import concurrent.futures +import re +from pathlib import Path + +import pytest +from fixtures.common_types import TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + tenant_get_shards, +) + + +@pytest.mark.timeout(600) +def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Check that sharding, including auto-splitting, "just works" under pgbench workloads. + + This is not a benchmark, but it lives in the same place as benchmarks in order to be run + on a dedicated node that can sustain some significant throughput. + + Other tests validate the details of shard splitting, error cases etc. This test is + the sanity check that it all really works as expected with realistic amounts of data + and under load. + + Success conditions: + - Tenants auto-split when their capacity grows + - Client workloads are not interrupted while that happens + """ + + neon_env_builder.num_pageservers = 8 + neon_env_builder.storage_controller_config = { + # Split tenants at 500MB: it's up to the storage controller how it interprets this (logical + # sizes, physical sizes, etc). We will write this much data logically, therefore other sizes + # will reliably be greater. + "split_threshold": 1024 * 1024 * 500 + } + + tenant_conf = { + # We want layer rewrites to happen as soon as possible (this is the most stressful + # case for the system), so set PITR interval to something tiny. 
+ "pitr_interval": "5s", + # Scaled down thresholds. We will run at ~1GB scale but would like to emulate + # the behavior of a system running at ~100GB scale. + "checkpoint_distance": f"{1024 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{1024 * 1024}", + "image_creation_threshold": "2", + "image_layer_creation_check_threshold": "0", + } + + env = neon_env_builder.init_start() + + for ps in env.pageservers: + ps.allowed_errors.extend( + [ + # We shut down pageservers while they might have some compaction work going on + ".*Compaction failed.*shutting down.*" + ] + ) + + env.storage_controller.allowed_errors.extend( + [ + # The neon_local functionality for updating computes is flaky for unknown reasons + ".*Local notification hook failed.*", + ".*Marking shard.*for notification retry.*", + ".*Failed to notify compute.*", + ] + ) + + # Total tenants + tenant_count = 4 + + # Transaction rate: we set this rather than running at full-speed because we + # might run on a slow node that doesn't cope well with many full-speed pgbenches running concurrently. 
+ transaction_rate = 100 + + class TenantState: + def __init__(self, timeline_id, endpoint): + self.timeline_id = timeline_id + self.endpoint = endpoint + + # Create tenants + tenants = {} + for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)): + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf) + endpoint = env.endpoints.create("main", tenant_id=tenant_id) + tenants[tenant_id] = TenantState(timeline_id, endpoint) + endpoint.start() + + def run_pgbench_init(endpoint): + pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-i", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + def check_pgbench_output(out_path: str): + """ + When we run pgbench, we want not just an absence of errors, but also continuous evidence + of I/O progressing: our shard splitting and migration should not interrrupt the benchmark. + """ + matched_lines = 0 + stderr = Path(f"{out_path}.stderr").read_text() + + low_watermark = None + + # Apply this as a threshold for what we consider an unacceptable interruption to I/O + min_tps = transaction_rate // 10 + + for line in stderr.split("\n"): + match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .* ([0-9]+) failed", line) + if match is None: + # Fall back to older-version pgbench output (omits failure count) + match = re.match(r"progress: ([0-9\.]+) s, ([0-9\.]+) tps, .*", line) + if match is None: + continue + else: + (_time, tps) = match.groups() + tps = float(tps) + failed = 0 + else: + (_time, tps, failed) = match.groups() # type: ignore + tps = float(tps) + failed = int(failed) + + matched_lines += 1 + + if failed > 0: + raise RuntimeError( + f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has failed > 0" + ) + + if low_watermark is None or low_watermark > tps: + low_watermark = tps + + # Temporarily disabled: have seen some 0 tps regions on Hetzner runners, but not + # at the same time as a shard split. 
+ # if tps < min_tps: + # raise RuntimeError( + # f"pgbench on tenant {endpoint.tenant_id} run at {out_path} has tps < {min_tps}" + # ) + + log.info(f"Checked {matched_lines} progress lines, lowest TPS was {min_tps}") + + if matched_lines == 0: + raise RuntimeError(f"pgbench output at {out_path} contained no progress lines") + + def run_pgbench_main(endpoint): + out_path = pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-T", + "180", + "-R", + f"{transaction_rate}", + "-P", + "1", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + check_pgbench_output(out_path) + + def run_pgbench_read(endpoint): + out_path = pg_bin.run_capture( + [ + "pgbench", + "-s50", + "-T", + "30", + "-R", + f"{transaction_rate}", + "-S", + "-P", + "1", + f"postgres://cloud_admin@localhost:{endpoint.pg_port}/postgres", + ] + ) + + check_pgbench_output(out_path) + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_init, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench inits") + for fut in pgbench_futs: + fut.result() + + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_main, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read/write pass") + for fut in pgbench_futs: + fut.result() + + def assert_all_split(): + for tenant_id in tenants.keys(): + shards = tenant_get_shards(env, tenant_id) + assert len(shards) == 8 + + # This is not a wait_until, because we wanted the splits to happen _while_ pgbench is running: otherwise + # this test is not properly doing its job of validating that splits work nicely under load. 
+ assert_all_split() + + env.storage_controller.assert_log_contains(".*Successful auto-split.*") + + # Log timeline sizes, useful for debug, and implicitly validates that the shards + # are available in the places the controller thinks they should be. + for tenant_id, tenant_state in tenants.items(): + (shard_zero_id, shard_zero_ps) = tenant_get_shards(env, tenant_id)[0] + timeline_info = shard_zero_ps.http_client().timeline_detail( + shard_zero_id, tenant_state.timeline_id + ) + log.info(f"{shard_zero_id} timeline: {timeline_info}") + + # Run compaction for all tenants, restart endpoint so that on subsequent reads we will + # definitely hit pageserver for reads. This compaction passis expected to drop unwanted + # layers but not do any rewrites (we're still in the same generation) + for tenant_id, tenant_state in tenants.items(): + tenant_state.endpoint.stop() + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None) + shard_ps.http_client().timeline_compact(shard_id, tenant_state.timeline_id) + tenant_state.endpoint.start() + + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read pass") + for fut in pgbench_futs: + fut.result() + + env.storage_controller.consistency_check() + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + env.storage_controller.consistency_check() + + # Restart all pageservers + for ps in env.pageservers: + ps.stop() + ps.start() + + # Freshen gc_info in Timeline, so that when compaction runs in the background in the + # subsequent pgbench period, the last_gc_cutoff is updated and enables the conditions for a rewrite to pass. 
+ for tenant_id, tenant_state in tenants.items(): + for shard_id, shard_ps in tenant_get_shards(env, tenant_id): + shard_ps.http_client().timeline_gc(shard_id, tenant_state.timeline_id, gc_horizon=None) + + # One last check data remains readable after everything has restarted + with concurrent.futures.ThreadPoolExecutor(max_workers=tenant_count) as pgbench_threads: + pgbench_futs = [] + for tenant_state in tenants.values(): + fut = pgbench_threads.submit(run_pgbench_read, tenant_state.endpoint) + pgbench_futs.append(fut) + + log.info("Waiting for pgbench read pass") + for fut in pgbench_futs: + fut.result() + + # Assert that some rewrites happened + # TODO: uncomment this after https://github.com/neondatabase/neon/pull/7531 is merged + # assert any(ps.log_contains(".*Rewriting layer after shard split.*") for ps in env.pageservers) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py new file mode 100644 index 0000000000..cb013ae8c3 --- /dev/null +++ b/test_runner/performance/test_storage_controller_scale.py @@ -0,0 +1,201 @@ +import concurrent.futures +import random +import time + +import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.compute_reconfigure import ComputeReconfigure +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pg_version import PgVersion + + +@pytest.mark.timeout(3600) # super long running test: should go down as we optimize +def test_storage_controller_many_tenants( + neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure +): + """ + Check that we cope well with a not-totally-trivial number of tenants. + + This is checking for: + - Obvious concurrency bugs from issuing many tenant creations/modifications + concurrently. 
+ - Obvious scaling bugs like O(N^2) scaling that would be so slow that even + a basic test starts failing from slowness. + + This is _not_ a comprehensive scale test: just a basic sanity check that + we don't fall over for a thousand shards. + """ + + neon_env_builder.num_pageservers = 5 + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to + # guard against regressions in restart time. + "max_unavailable": "300s" + } + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write + compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) + + env = neon_env_builder.init_start() + + # We will intentionally stress reconciler concurrrency, which triggers a warning when lots + # of shards are hitting the delayed path. + env.storage_controller.allowed_errors.append(".*Many shards are waiting to reconcile") + + for ps in env.pageservers: + # This can happen because when we do a loop over all pageservers and mark them offline/active, + # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of + # bumping generation before other attachments are detached. + # + # We could clean this up by making reconcilers respect the .observed of their predecessor, if + # we spawn with a wait for the predecessor. + ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + # Storage controller is allowed to drop pageserver requests when the cancellation token + # for a Reconciler fires. 
+ ps.allowed_errors.append(".*request was dropped before completing.*") + + # Total tenants + tenant_count = 4000 + + # Shards per tenant + shard_count = 2 + stripe_size = 1024 + + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + def check_memory(): + # Shards should be cheap_ in memory, as we will have very many of them + expect_memory_per_shard = 128 * 1024 + + rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") + assert rss is not None + log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") + assert rss < expect_memory_per_shard * shard_count * tenant_count + + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. + rng = random.Random(1234) + + # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore + # permits, to ensure that we are exercising stressing that. + api_concurrency = 135 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. loads/saves a config file on each call) + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + futs = [] + t1 = time.time() + for tenant_id in tenants: + f = executor.submit( + env.storage_controller.tenant_create, + tenant_id, + shard_count, + stripe_size, + # Upload heatmaps fast, so that secondary downloads happen promptly, enabling + # the controller's optimization migrations to proceed promptly. 
+ tenant_config={"heatmap_period": "10s"}, + placement_policy={"Attached": 1}, + ) + futs.append(f) + + # Wait for creations to finish + for f in futs: + f.result() + log.info( + f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" + ) + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + op_tenants = list(tenants)[0:run_ops] + + # Generate a mixture of operations and dispatch them all concurrently + futs = [] + for tenant_id in op_tenants: + op = rng.choice([0, 1, 2]) + if op == 0: + # A fan-out write operation to all shards in a tenant (timeline creation) + f = executor.submit( + virtual_ps_http.timeline_create, + PgVersion.NOT_SET, + tenant_id, + TimelineId.generate(), + ) + elif op == 1: + # A reconciler operation: migrate a shard. + shard_number = rng.randint(0, shard_count - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + dest_ps_id = rng.choice([ps.id for ps in env.pageservers]) + f = executor.submit( + env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id + ) + elif op == 2: + # A passthrough read to shard zero + f = executor.submit(virtual_ps_http.tenant_status, tenant_id) + + futs.append(f) + + # Wait for mixed ops to finish + for f in futs: + f.result() + + # Consistency check is safe here: all the previous operations waited for reconcile before completing + env.storage_controller.consistency_check() + check_memory() + + # This loop waits for reconcile_all to indicate no pending work, and then calls it once more to time + # how long the call takes when idle: this iterates over shards while doing no I/O and should be reliably fast: if + # it isn't, that's a sign that we have made some algorithmic mistake (e.g. O(N**2) scheduling) + # + # We do not require that the system is quiescent already here, although at present in this point in the test + # that may be the case. 
+ while True: + t1 = time.time() + reconcilers = env.storage_controller.reconcile_all() + if reconcilers == 0: + # Time how long a no-op background reconcile takes: this measures how long it takes to + # loop over all the shards looking for work to do. + runtime = time.time() - t1 + log.info(f"No-op call to reconcile_all took {runtime}s") + assert runtime < 1 + break + + # Restart the storage controller + env.storage_controller.stop() + env.storage_controller.start() + + # See how long the controller takes to pass its readiness check. This should be fast because + # all the nodes are online: offline pageservers are the only thing that's allowed to delay + # startup. + readiness_period = env.storage_controller.wait_until_ready() + assert readiness_period < 5 + + # Consistency check is safe here: the storage controller's restart should not have caused any reconcilers + # to run, as it was in a stable state before restart. If it did, that's a bug. + env.storage_controller.consistency_check() + check_memory() + + # Restart pageservers: this exercises the /re-attach API + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, + # as they were not offline long enough to trigger any scheduling changes. + env.storage_controller.consistency_check() + check_memory() + + # Stop the storage controller before tearing down fixtures, because it otherwise might log + # errors trying to call our `ComputeReconfigure`. 
+ env.storage_controller.stop() diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 7eb244d378..513ebc74c3 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -6,10 +6,10 @@ from typing import Any, Callable, List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin -from fixtures.types import Lsn from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix diff --git a/test_runner/pg_clients/csharp/npgsql/Dockerfile b/test_runner/pg_clients/csharp/npgsql/Dockerfile index b23eb2e5eb..71717a6006 100644 --- a/test_runner/pg_clients/csharp/npgsql/Dockerfile +++ b/test_runner/pg_clients/csharp/npgsql/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build +FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build WORKDIR /source COPY *.csproj . @@ -7,7 +7,7 @@ RUN dotnet restore COPY . . RUN dotnet publish -c release -o /app --no-restore -FROM mcr.microsoft.com/dotnet/runtime:7.0 +FROM mcr.microsoft.com/dotnet/runtime:8.0 WORKDIR /app COPY --from=build /app . 
diff --git a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj index bb4427f2c4..edf2a01337 100644 --- a/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj +++ b/test_runner/pg_clients/csharp/npgsql/csharp-npgsql.csproj @@ -2,13 +2,13 @@ Exe - net7.0 + net8.0 enable enable - + diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 74eb9bdc32..7e074e07b8 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,10 +1,10 @@ -FROM openjdk:20 +FROM openjdk:21 WORKDIR /source COPY . . WORKDIR /app -RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.6.0.jar && \ +RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.7.2.jar && \ javac -d /app /source/Example.java CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"] diff --git a/test_runner/pg_clients/python/asyncpg/Dockerfile b/test_runner/pg_clients/python/asyncpg/Dockerfile index 8b6d56b8fb..f2cc37a7bb 100644 --- a/test_runner/pg_clients/python/asyncpg/Dockerfile +++ b/test_runner/pg_clients/python/asyncpg/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/python/asyncpg/requirements.txt b/test_runner/pg_clients/python/asyncpg/requirements.txt index b33c21474c..61972959a9 100644 --- a/test_runner/pg_clients/python/asyncpg/requirements.txt +++ b/test_runner/pg_clients/python/asyncpg/requirements.txt @@ -1 +1 @@ -asyncpg==0.27.0 +asyncpg==0.29.0 diff --git a/test_runner/pg_clients/python/pg8000/Dockerfile b/test_runner/pg_clients/python/pg8000/Dockerfile index ebef1f9059..ee1de20da5 100644 --- a/test_runner/pg_clients/python/pg8000/Dockerfile +++ b/test_runner/pg_clients/python/pg8000/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.12 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index a8407c3cb0..e086a937e6 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.29.8 +pg8000==1.30.5 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 3ac0f16e4b..a4a2426b97 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.74" +version = "0.1.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" +checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", @@ -51,9 +51,9 @@ dependencies = [ [[package]] name = "base64" -version = "0.21.4" +version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ba43ea6f343b788c8764558649e08df62f86c6ef251fdaeb1ffd010a9ae50a2" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bitflags" @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.1" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.14.0" +version = "3.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" [[package]] name = "byteorder" @@ -96,12 +96,9 @@ checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" [[package]] name = "cc" -version = "1.0.83" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] +checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" [[package]] name = "cfg-if" @@ -111,9 +108,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "core-foundation" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "194a7a9e6de53fa55116934067c844d9d749312f75c6f6d0980e8c252f8c2146" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ "core-foundation-sys", "libc", @@ -121,15 +118,15 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" [[package]] name = "cpufeatures" -version = "0.2.9" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ "libc", ] @@ -157,12 +154,12 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.5" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" +checksum = 
"a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -200,9 +197,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "futures" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ "futures-channel", "futures-core", @@ -215,9 +212,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" dependencies = [ "futures-core", "futures-sink", @@ -225,15 +222,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] name = "futures-executor" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" dependencies = [ "futures-core", "futures-task", @@ -242,15 +239,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" 
[[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", @@ -259,21 +256,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ "futures-channel", "futures-core", @@ -299,9 +296,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", @@ -310,9 +307,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.0" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +checksum = 
"4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "hmac" @@ -325,9 +322,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" dependencies = [ "wasm-bindgen", ] @@ -340,15 +337,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.149" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "linux-raw-sys" -version = "0.4.10" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" [[package]] name = "lock_api" @@ -362,9 +359,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.20" +version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "md-5" @@ -378,28 +375,28 @@ dependencies = [ [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] [[package]] name = "mio" -version = "0.8.8" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -422,26 +419,26 @@ dependencies = [ [[package]] name = "object" -version = "0.32.1" +version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl" -version = "0.10.60" +version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79a4c6c3a2b158f7f8f2a2fc5a969fa3a068df6fc9dbb4a43845436e3af7c800" +checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "cfg-if", "foreign-types", "libc", @@ -469,9 +466,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.96" +version = "0.9.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3812c071ba60da8b5677cc12bcb1d42989a65553772897a7e0355545a819838f" +checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" dependencies = 
[ "cc", "libc", @@ -497,16 +494,16 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.4.1", + "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.48.5", ] [[package]] name = "percent-encoding" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" @@ -540,9 +537,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "postgres-native-tls" @@ -594,18 +591,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -640,15 +637,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -676,24 +664,24 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustix" -version = "0.38.19" +version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 2.4.1", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "schannel" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -753,18 +741,18 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "socket2" -version = "0.5.4" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" +checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -786,9 +774,9 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" -version = "2.0.38" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -797,15 +785,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.8.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.3.5", "rustix", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -825,9 +812,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.33.0" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", @@ -836,14 +823,14 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] name = "tokio-macros" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", @@ -888,9 +875,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", @@ -927,9 +914,9 @@ checksum = 
"42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" [[package]] name = "unicode-ident" @@ -939,9 +926,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" dependencies = [ "tinyvec", ] @@ -965,10 +952,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.87" +name = "wasite" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -976,9 +969,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" dependencies = [ "bumpalo", "log", @@ -991,9 +984,9 @@ dependencies = [ [[package]] name = 
"wasm-bindgen-macro" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1001,9 +994,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", @@ -1014,15 +1007,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.87" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] name = "web-sys" -version = "0.3.64" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" dependencies = [ "js-sys", "wasm-bindgen", @@ -1030,11 +1023,12 @@ dependencies = [ [[package]] name = "whoami" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" +checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" dependencies = [ - "wasm-bindgen", + "redox_syscall", + "wasite", "web-sys", ] @@ -1044,7 +1038,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.4", ] [[package]] @@ -1053,13 +1056,28 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +dependencies = [ + "windows_aarch64_gnullvm 0.52.4", + "windows_aarch64_msvc 0.52.4", + "windows_i686_gnu 0.52.4", + "windows_i686_msvc 0.52.4", + "windows_x86_64_gnu 0.52.4", + "windows_x86_64_gnullvm 0.52.4", + "windows_x86_64_msvc 0.52.4", ] [[package]] @@ -1068,38 +1086,80 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 6f100aafd5..0f420e5b06 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -9,7 +9,7 @@ publish = false [dependencies] native-tls = "0.2.11" postgres-native-tls = "0.5.0" -tokio = { version = "1.33", features=["rt", "macros"] } +tokio = { version = "1.36", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 1d3709803e..8611e66cbb 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.73 +FROM rust:1.76 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 9538cf4ed4..0402838820 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 61e1d1bba6..9130e0973f 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.8 AS build +FROM swift:5.9 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.8 +FROM swift:5.9 WORKDIR /app COPY --from=build /source/.build/release . CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 9f13106011..023e03a7b1 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -5,8 +5,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "061a0836d7c1887e04a975d1d2eaa2ef5fd7dfab", - "version" : "1.16.0" + "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", + "version" : "1.20.2" } }, { @@ -14,8 +14,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-atomics.git", "state" : { - "revision" : "6c89474e62719ddcc1e9614989fff2f68208fe10", - "version" : "1.1.0" + "revision" : "cd142fd2f64be2100422d658e7411e39489da985", + "version" : "1.2.0" } }, { @@ -41,8 +41,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-log.git", "state" : { - "revision" : "32e8d724467f8fe623624570367e3d50c5638e46", - "version" : "1.5.2" + "revision" : "e97a6fcb1ab07462881ac165fdbb37f067e205d5", + "version" : "1.5.4" } }, { @@ -50,8 +50,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-metrics.git", "state" : { - "revision" : "9b39d811a83cf18b79d7d5513b06f8b290198b10", - "version" : "2.3.3" + "revision" : 
"971ba26378ab69c43737ee7ba967a896cb74c0d1", + "version" : "2.4.1" } }, { @@ -59,8 +59,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio.git", "state" : { - "revision" : "6213ba7a06febe8fef60563a4a7d26a4085783cf", - "version" : "2.54.0" + "revision" : "635b2589494c97e48c62514bc8b37ced762e0a62", + "version" : "2.63.0" } }, { @@ -68,8 +68,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-ssl.git", "state" : { - "revision" : "e866a626e105042a6a72a870c88b4c531ba05f83", - "version" : "2.24.0" + "revision" : "7c381eb6083542b124a6c18fae742f55001dc2b5", + "version" : "2.26.0" } }, { @@ -77,8 +77,17 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/apple/swift-nio-transport-services.git", "state" : { - "revision" : "41f4098903878418537020075a4d8a6e20a0b182", - "version" : "1.17.0" + "revision" : "6cbe0ed2b394f21ab0d46b9f0c50c6be964968ce", + "version" : "1.20.1" + } + }, + { + "identity" : "swift-system", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-system.git", + "state" : { + "revision" : "025bcb1165deab2e20d4eaba79967ce73013f496", + "version" : "1.2.1" } } ], diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index a80590daa2..637eb4bc9d 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.9 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.16.0") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile 
index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index 4cedf56acd..b4f8587eac 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,24 +5,24 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } }, "node_modules/doublylinked": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.2.tgz", - "integrity": "sha512-TDh0XfQWWDrfvGdAN0hLNIdkTXlw04nVCO5B/37ie4dV0yw1iT9ZrZ6tD+q/0SwXxeI/u6TF9Mxgd7s5/XYV6A==", + "version": "2.5.4", + "resolved": "https://registry.npmjs.org/doublylinked/-/doublylinked-2.5.4.tgz", + "integrity": "sha512-jBCKDnFkEHJRjQvYEl5N9VngRV8ypHgw6a52OK4VN57eV2r2rYvgOx9uABdY78INNoW7S6auULp+KBVm/jfYqw==", "engines": { "node": ">= 10.0" } }, "node_modules/lightning-pool": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.1.tgz", - "integrity": "sha512-/pUIoGD3nzTH/wI4TYiJM3cLPeUOzGMTfFeBRuxaOAnwL0LZfwvqn5YFqsfyF98M0C3UXxWgfTz+Lu6okkno+g==", + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/lightning-pool/-/lightning-pool-4.2.2.tgz", + "integrity": "sha512-KW0Df0IbjNLxy5wAsdErTKYtHGwefLRQseHNksEctyaL7gtRwJT0nqLa2uiRdNYDwKSnZtqOjSjUNtfxmfH1qw==", "dependencies": { - "doublylinked": "^2.5.2", - "putil-promisify": "^1.8.6" + "doublylinked": "^2.5.3", + "putil-promisify": "^1.10.1" } }, "node_modules/obuf": { @@ -42,16 +42,16 @@ } }, "node_modules/postgresql-client": { - "version": "2.5.9", - "resolved": 
"https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.5.9.tgz", - "integrity": "sha512-s+kgTN6TfWLzehEyxw4Im4odnxVRCbZ0DEJzWS6SLowPAmB2m1/DOiOvZC0+ZVoi5AfbGE6SBqFxKguSyVAXZg==", + "version": "2.10.5", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", + "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", "dependencies": { - "doublylinked": "^2.5.2", - "lightning-pool": "^4.2.1", + "doublylinked": "^2.5.4", + "lightning-pool": "^4.2.2", "postgres-bytea": "^3.0.0", - "power-tasks": "^1.7.0", - "putil-merge": "^3.10.3", - "putil-promisify": "^1.10.0", + "power-tasks": "^1.7.3", + "putil-merge": "^3.12.1", + "putil-promisify": "^1.10.1", "putil-varhelpers": "^1.6.5" }, "engines": { @@ -60,30 +60,29 @@ } }, "node_modules/power-tasks": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.0.tgz", - "integrity": "sha512-rndZXCDxhuIDjPUJJvQwBDHaYagCkjvbPF/NA+omh/Ef4rAI9KtnvdA0k98dyiGpn1zXOpc6c2c0JWzg/xAhJg==", + "version": "1.7.3", + "resolved": "https://registry.npmjs.org/power-tasks/-/power-tasks-1.7.3.tgz", + "integrity": "sha512-EnkjLfaX4PxFYHbUWyWzlE4I8SgctaW9jx4qQXrVRoELlqBXrxIMtuhHzRwsHv2qs1tO7efOcZa6/wDCdCjRfA==", "dependencies": { - "doublylinked": "^2.5.2", - "strict-typed-events": "^2.3.1" + "doublylinked": "^2.5.4", + "strict-typed-events": "^2.3.3" }, "engines": { - "node": ">=14.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/putil-merge": { - "version": "3.10.3", - "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.10.3.tgz", - "integrity": "sha512-B18CYi0/SmBYl9+fgowYWkgzJM/8XcLSeafHrFrGzwySQuOzLW0sOGx0CdFVp9zqaxgLctexUdGoSPpm6CPM6A==", + "version": "3.12.1", + "resolved": "https://registry.npmjs.org/putil-merge/-/putil-merge-3.12.1.tgz", + "integrity": "sha512-4clPyRkJPrd5zl98AP7I3JamyXbx0ixe2CnfvGwoTyWSr7Kslcv8weoKjfU4BMBifkWIRL54l4OrNe97pYcDwQ==", "engines": { "node": 
">= 10.0" } }, "node_modules/putil-promisify": { - "version": "1.10.0", - "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.0.tgz", - "integrity": "sha512-zYPoAoMxmf8pC+I75kRkYkVMwU4ZbZl82aTGema175bmhQ06BEJuuOlzOy1buQK9G+hCyQ+BFpzMTKAJhD8rZw==", + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/putil-promisify/-/putil-promisify-1.10.1.tgz", + "integrity": "sha512-1jm0egJNrj5eBDRj15Cg08RNHDV91OVEHeeYjAFRcs663PXxFokndxcJAGbaO6CSErCTp8eTgC8vuOF+fvXIAA==", "engines": { "node": ">= 14.0" } @@ -97,21 +96,21 @@ } }, "node_modules/strict-typed-events": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.1.tgz", - "integrity": "sha512-Z1h8KpVbrVg34Vwy/VwTD/tS9tFebH2h1Kvw4xnPkKpkISMwUpnqwU44rMfkKMpXbFCybIgDt7ARoCGTzURZhQ==", + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/strict-typed-events/-/strict-typed-events-2.3.3.tgz", + "integrity": "sha512-Vc8/N5giCVpO2n5BCskqDD9ns7RkdEq0pFd4yQk1ROULusJDbjORNvbtyEPxxK7Xqn9/NdW8XHLxv/PvUTgFsA==", "dependencies": { - "putil-promisify": "^1.8.5", - "ts-gems": "^2.2.0" + "putil-promisify": "^1.10.1", + "ts-gems": "^3.1.0" }, "engines": { "node": ">=16.0" } }, "node_modules/ts-gems": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-2.4.0.tgz", - "integrity": "sha512-SdugYAXoWvbqrxLodIObzxhEKacDxh5LfAJIiIkiH7q5thvuuCzdmkdTVQYf7uEDrEpPhfx4tokDMamdO3be9A==" + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/ts-gems/-/ts-gems-3.1.1.tgz", + "integrity": "sha512-Li1Z44FnxN06c1lBwFepb932jPYT+4eOvOmoiC30lOTkvOJOERr9xZFg3UA9y19OYO9CrW3ZSqNL66DUSuwFTw==" } } } diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 12703ce89f..07ec100d0d 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ 
-1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.5.9" + "postgresql-client": "2.10.5" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 07e98c586b..004b383749 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:20 +FROM node:21 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index 72cc452817..f3b456f1ed 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,14 +5,14 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.17.1" } }, "node_modules/@neondatabase/serverless": { - "version": "0.4.18", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.4.18.tgz", - "integrity": "sha512-2TZnIyRGC/+0fjZ8TKCzaSTPUD94PM7NBGuantGZbUrbWyqBwGnUoRtdZAQ95qBKVHqORLVfymlv2NE+HQMFeA==", + "version": "0.9.0", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", + "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", "dependencies": { "@types/pg": "8.6.6" } @@ -96,9 +96,9 @@ } }, "node_modules/ws": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.13.0.tgz", - "integrity": "sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==", + "version": "8.17.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", + "integrity": 
"sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==", "engines": { "node": ">=10.0.0" }, diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 840c7a5c4c..3ae7a8a6cf 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.4.18", - "ws": "8.13.0" + "@neondatabase/serverless": "0.9.0", + "ws": "8.17.1" } } diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index 0e390ba9e5..7e40081aa2 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import query_scalar @@ -45,7 +45,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch1. env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100) endpoint_branch1 = env.endpoints.create_start("branch1", tenant_id=tenant) - log.info("postgres is running on 'branch1' branch") branch1_cur = endpoint_branch1.connect().cursor() branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id")) @@ -68,7 +67,6 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): # Create branch2. 
env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200) endpoint_branch2 = env.endpoints.create_start("branch2", tenant_id=tenant) - log.info("postgres is running on 'branch2' branch") branch2_cur = endpoint_branch2.connect().cursor() branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id")) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index ed389b1aa2..f4667a82dc 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -2,13 +2,13 @@ from dataclasses import dataclass from typing import Generator, Optional import pytest +from fixtures.common_types import TenantId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException, TenantConfig from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId from fixtures.utils import wait_until @@ -17,9 +17,13 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - # eviction might be the first one after an attach to access the layers - env.pageserver.allowed_errors.append( - ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction" + env.pageserver.allowed_errors.extend( + [ + # eviction might be the first one after an attach to access the layers + ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", + # detach can happen before we get to validate the generation number + ".*deletion backend: Dropped remote consistent LSN updates for tenant.*", + ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) return env @@ -63,10 +67,11 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N ] ) - def 
log_contains_bad_request(): - env.pageserver.log_contains(".*Error processing HTTP request: Bad request") - - wait_until(50, 0.1, log_contains_bad_request) + wait_until( + 50, + 0.1, + lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), + ) def test_null_body(negative_env: NegativeTests): @@ -136,7 +141,7 @@ def test_no_config(positive_env: NeonEnv, content_type: Optional[str]): ps_http.tenant_detach(tenant_id) assert tenant_id not in [TenantId(t["id"]) for t in ps_http.tenant_list()] - body = {"generation": env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id)} + body = {"generation": env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id)} ps_http.post( f"{ps_http.base_url}/v1/tenant/{tenant_id}/attach", @@ -160,23 +165,38 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", + "compaction_algorithm": { + "kind": "tiered", + }, "eviction_policy": { "kind": "LayerAccessThreshold", "period": "20s", "threshold": "23h", }, "evictions_low_residence_duration_metric_threshold": "2days", - "gc_feedback": True, "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", "heatmap_period": "10m", "image_creation_threshold": 7, "pitr_interval": "1m", "lagging_wal_timeout": "23m", + "lazy_slru_download": True, "max_lsn_wal_lag": 230000, "min_resident_size_override": 23, + "timeline_get_throttle": { + "task_kinds": ["PageRequestHandler"], + "fair": True, + "initial": 0, + "refill_interval": "1s", + "refill_amount": 1000, + "max": 1000, + }, "trace_read_requests": True, "walreceiver_connect_timeout": "13m", + "image_layer_creation_check_threshold": 1, + "switch_aux_file_policy": "cross-validation", + "lsn_lease_length": "1m", + "lsn_lease_length_for_ts": "5s", } ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index bd87ff3efd..035ab2796f 100644 
--- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -4,13 +4,13 @@ from pathlib import Path import psycopg2 import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgProtocol, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.types import TenantId, TimelineId def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient): @@ -105,7 +105,7 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): # The neon_local tool generates one key pair at a hardcoded path by default. # As a preparation for our test, move the public key of the key pair into a # directory at the same location as the hardcoded path by: - # 1. moving the the file at `configured_pub_key_path` to a temporary location + # 1. moving the file at `configured_pub_key_path` to a temporary location # 2. creating a new directory at `configured_pub_key_path` # 3. 
moving the file from the temporary location into the newly created directory configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem" @@ -225,9 +225,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): check_pageserver(True, password=pageserver_token) - env.pageserver.allowed_errors.append( - ".*SafekeeperData scope makes no sense for Pageserver.*" - ) + env.pageserver.allowed_errors.append(".*JWT scope '.+' is ineligible for Pageserver auth.*") check_pageserver(False, password=safekeeper_token) def check_safekeeper(expect_success: bool, **conn_kwargs): diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py new file mode 100644 index 0000000000..5328aef156 --- /dev/null +++ b/test_runner/regress/test_aux_files.py @@ -0,0 +1,76 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + AuxFileStore, + NeonEnvBuilder, + logical_replication_sync, +) + + +def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + client = env.pageserver.http_client() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + tenant_config = client.tenant_config(tenant_id).effective_config + tenant_config["switch_aux_file_policy"] = AuxFileStore.V2 + client.set_tenant_config(tenant_id, tenant_config) + # aux file v2 is enabled on the write path, so for now, it should be unset (or null) + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"] + is None + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("create table t(pk integer primary key, payload integer)") + cur.execute( + "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));" + ) + cur.execute("create publication pub1 for table t, replication_example") + + # now start subscriber, aux files will be 
created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils) + # instead of going through the full logical replication process. + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") + vanilla_pg.safe_psql( + "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);" + ) + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + # Wait logical replication channel to be established + logical_replication_sync(vanilla_pg, endpoint) + vanilla_pg.stop() + endpoint.stop() + + with env.pageserver.http_client() as client: + # aux file v2 flag should be enabled at this point + assert ( + client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] + == AuxFileStore.V2 + ) + with env.pageserver.http_client() as client: + tenant_config = client.tenant_config(tenant_id).effective_config + tenant_config["switch_aux_file_policy"] = "V1" + client.set_tenant_config(tenant_id, tenant_config) + # the flag should still be enabled + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ + "last_aux_file_policy" + ] + == AuxFileStore.V2 + ) + env.pageserver.restart() + with env.pageserver.http_client() as client: + # aux file v2 flag should be persisted + assert ( + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ + "last_aux_file_policy" + ] + == AuxFileStore.V2 + ) diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index bc3faf9271..819912dd05 100644 --- a/test_runner/regress/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -107,7 +107,6 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): # which is 
needed for backpressure_lsns() to work endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() - log.info("postgres is running on 'test_backpressure' branch") # setup check thread check_stop_event = threading.Event() diff --git a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index ba0624c730..82a3a05c2b 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -1,31 +1,42 @@ import random import time +import psycopg2.errors +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +@pytest.mark.timeout(600) def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(".*simulated connection error.*") + # Enable failpoint before starting everything else up so that we exercise the retry + # on fetching basebackup pageserver_http = env.pageserver.http_client() + pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) + env.neon_cli.create_branch("test_compute_pageserver_connection_stress") endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress") - # Enable failpoint after starting everything else up so that loading initial - # basebackup doesn't fail - pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) - pg_conn = endpoint.connect() cur = pg_conn.cursor() + def execute_retry_on_timeout(query): + while True: + try: + cur.execute(query) + return + except psycopg2.errors.QueryCanceled: + log.info(f"Query '{query}' timed out - retrying") + # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point # of this test. 
- cur.execute("CREATE TABLE foo (t text)") - cur.execute( + execute_retry_on_timeout("CREATE TABLE foo (t text)") + execute_retry_on_timeout( """ INSERT INTO foo SELECT 'long string to consume some space' || g @@ -34,7 +45,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): ) # Verify that the table is larger than shared_buffers - cur.execute( + execute_retry_on_timeout( """ select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size from pg_settings where name = 'shared_buffers' @@ -45,16 +56,16 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) - cur.execute("SELECT count(*) FROM foo") + execute_retry_on_timeout("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) end_time = time.time() + 30 times_executed = 0 while time.time() < end_time: if random.random() < 0.5: - cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')") + execute_retry_on_timeout("INSERT INTO foo VALUES ('stas'), ('heikki')") else: - cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") + execute_retry_on_timeout("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10") cur.fetchall() times_executed += 1 log.info(f"Workload executed {times_executed} times") diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index bdc944f352..eb503ddbfa 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -2,10 +2,10 @@ import threading import time import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.http import TimelineCreate406 -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar @@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): 
env = neon_simple_env pageserver_http_client = env.pageserver.http_client() - env.pageserver.allowed_errors.extend( - [ - ".*invalid branch start lsn: less than latest GC cutoff.*", - ".*invalid branch start lsn: less than planned GC cutoff.*", - ] - ) + error_regexes = [ + ".*invalid branch start lsn: less than latest GC cutoff.*", + ".*invalid branch start lsn: less than planned GC cutoff.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Disable background GC but set the `pitr_interval` to be small, so GC can delete something tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 9879254897..0a5336f5a2 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -1,8 +1,8 @@ import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import TimelineCreate406 -from fixtures.types import Lsn, TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -11,17 +11,18 @@ from fixtures.utils import print_gc_result, query_scalar # def test_branch_behind(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) - env.pageserver.allowed_errors.extend( - [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"] - ) + error_regexes = [ + ".*invalid branch start lsn.*", + ".*invalid start lsn .* for ancestor timeline.*", + ] + env.pageserver.allowed_errors.extend(error_regexes) + env.storage_controller.allowed_errors.extend(error_regexes) # Branch at the point 
where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") endpoint_main = env.endpoints.create_start("test_branch_behind") - log.info("postgres is running on 'test_branch_behind' branch") main_cur = endpoint_main.connect().cursor() diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 9a0b91b54e..03d6946c15 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -1,9 +1,11 @@ import random import threading import time +from concurrent.futures import ThreadPoolExecutor from typing import List import pytest +from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -13,7 +15,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import wait_until_tenant_active -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException @@ -84,11 +85,11 @@ def test_branching_with_pgbench( threads = [] if ty == "cascade": - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{i}", tenant_id=tenant) else: - env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", "b0", tenant_id=tenant) - endpoints.append(env.endpoints.create_start("b{}".format(i + 1), tenant_id=tenant)) + endpoints.append(env.endpoints.create_start(f"b{i + 1}", tenant_id=tenant)) threads.append( threading.Thread(target=run_pgbench, args=(endpoints[-1].connstr(),), daemon=True) @@ -347,6 +348,87 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB ps_http.timeline_detail(env.initial_tenant, branch_id) +def test_duplicate_creation(neon_env_builder: 
NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + env.pageserver.tenant_create(env.initial_tenant) + + success_timeline = TimelineId.generate() + log.info(f"Creating timeline {success_timeline}") + ps_http = env.pageserver.http_client() + success_result = ps_http.timeline_create( + env.pg_version, env.initial_tenant, success_timeline, timeout=60 + ) + + ps_http.configure_failpoints(("timeline-creation-after-uninit", "pause")) + + def start_creating_timeline(): + log.info(f"Creating (expect failure) timeline {env.initial_timeline}") + with pytest.raises(RequestException): + ps_http.timeline_create( + env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 + ) + + t = threading.Thread(target=start_creating_timeline) + try: + t.start() + + wait_until_paused(env, "timeline-creation-after-uninit") + + # While timeline creation is in progress, trying to create a timeline + # again with the same ID should return 409 + with pytest.raises( + PageserverApiException, match="creation of timeline with the given ID is in progress" + ): + ps_http.timeline_create( + env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60 + ) + + # Creation of a timeline already successfully created is idempotent, and is not impeded by some + # other timeline creation with a different TimelineId being stuck. + repeat_result = ps_http.timeline_create( + env.pg_version, env.initial_tenant, success_timeline, timeout=60 + ) + assert repeat_result == success_result + finally: + env.pageserver.stop(immediate=True) + t.join() + + # now without a failpoint + env.pageserver.start() + + wait_until_tenant_active(ps_http, env.initial_tenant) + + with pytest.raises(PageserverApiException, match="not found"): + ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + + # The one successfully created timeline should still be there. 
+ assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1 + + +def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + failpoint = "Timeline::find_gc_cutoffs-pausable" + + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.timeline_gc, env.initial_tenant, env.initial_timeline, None) + + wait_until_paused(env, failpoint) + + env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + + client.configure_failpoints((failpoint, "off")) + + completion.result() + + def wait_until_paused(env: NeonEnv, failpoint: str): found = False msg = f"at failpoint {failpoint}" diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 4da0ba7b20..61afd820ca 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -3,6 +3,7 @@ import os from typing import List, Tuple import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,17 +11,22 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.types import TenantId, TimelineId +from fixtures.pg_version import PgVersion # Test restarting page server, while safekeeper and compute node keep # running. 
def test_local_corruption(neon_env_builder: NeonEnvBuilder): + if neon_env_builder.pageserver_get_impl == "vectored": + reconstruct_function_name = "get_values_reconstruct_data" + else: + reconstruct_function_name = "get_value_reconstruct_data" + env = neon_env_builder.init_start() env.pageserver.allowed_errors.extend( [ - ".*get_value_reconstruct_data for layer .*", + f".*{reconstruct_function_name} for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", @@ -50,14 +56,8 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): (tenant0, timeline0, pg0) = tenant_timelines[0] log.info(f"Timeline {tenant0}/{timeline0} is left intact") - (tenant1, timeline1, pg1) = tenant_timelines[1] - metadata_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/metadata" - with open(metadata_path, "w") as f: - f.write("overwritten with garbage!") - log.info(f"Timeline {tenant1}/{timeline1} got its metadata spoiled") - - (tenant2, timeline2, pg2) = tenant_timelines[2] - timeline_path = f"{env.pageserver.workdir}/tenants/{tenant2}/timelines/{timeline2}/" + (tenant1, timeline1, pg1) = tenant_timelines[2] + timeline_path = f"{env.pageserver.workdir}/tenants/{tenant1}/timelines/{timeline1}/" for filename in os.listdir(timeline_path): if filename.startswith("00000"): # Looks like a layer file. 
Corrupt it @@ -66,7 +66,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): with open(p, "wb") as f: f.truncate(0) f.truncate(size) - log.info(f"Timeline {tenant2}/{timeline2} got its local layer files spoiled") + log.info(f"Timeline {tenant1}/{timeline1} got its local layer files spoiled") env.pageserver.start() @@ -74,19 +74,15 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): pg0.start() assert pg0.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Tenant with corrupt local metadata works: remote storage is authoritative for metadata - pg1.start() - assert pg1.safe_psql("SELECT COUNT(*) FROM t")[0][0] == 100 - # Second timeline will fail during basebackup, because the local layer file is corrupt. # It will fail when we try to read (and reconstruct) a page from it, ergo the error message. # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: - pg2.start() + with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err: + pg1.start() log.info( - f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" + f"As expected, compute startup failed for timeline {tenant1}/{timeline1} with corrupt layers: {err}" ) @@ -126,7 +122,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed. 
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return")) with pytest.raises(Exception, match="before-checkpoint-new-timeline"): - _ = env.neon_cli.create_timeline("test_timeline_init_break_before_checkpoint", tenant_id) + _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate()) # Restart the page server env.pageserver.restart(immediate=True) @@ -160,7 +156,7 @@ def test_timeline_init_break_before_checkpoint_recreate( ] ) - env.pageserver.tenant_create(env.initial_tenant) + env.neon_cli.create_tenant(env.initial_tenant) tenant_id = env.initial_tenant timelines_dir = env.pageserver.timeline_dir(tenant_id) @@ -203,7 +199,7 @@ def test_timeline_init_break_before_checkpoint_recreate( assert timeline_id == new_timeline_id -def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder): +def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -213,10 +209,10 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] - # Introduce failpoint when creating a new timeline uninit mark, before any other files were created - pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return")) - with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"): - _ = env.neon_cli.create_timeline("test_timeline_create_break_after_uninit_mark", tenant_id) + # Introduce failpoint when creating a new timeline, right after creating its directory + pageserver_http.configure_failpoints(("after-timeline-dir-creation", "return")) + with pytest.raises(Exception, match="after-timeline-dir-creation"): + _ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate()) # Creating the timeline didn't finish. 
The other timelines on tenant should still be present and work normally. # "New" timeline is not present in the list, allowing pageserver to retry the same request diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index adb67a579e..97ab69049d 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -85,9 +85,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): # the endpoint. Whereas the previous reconfiguration was like a healthy migration, this # is more like what happens in an unexpected pageserver failure. # - # Since we're dual-attached, need to tip-off attachment service to treat the one we're + # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) + env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[0].id) env.pageservers[0].start() env.pageservers[1].stop() @@ -97,9 +97,9 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): assert fetchone() == (100000,) env.pageservers[0].stop() - # Since we're dual-attached, need to tip-off attachment service to treat the one we're + # Since we're dual-attached, need to tip-off storage controller to treat the one we're # about to start as the attached pageserver - env.attachment_service.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) + env.storage_controller.attach_hook_issue(env.initial_tenant, env.pageservers[1].id) env.pageservers[1].start() # Test a (former) bug where a child process spins without updating its connection string diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index f22eca02cc..26e6e336b9 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -25,7 +25,6 @@ def 
test_clog_truncate(neon_simple_env: NeonEnv): ] endpoint = env.endpoints.create_start("test_clog_truncate", config_lines=config) - log.info("postgres is running on test_clog_truncate branch") # Install extension containing function needed for test endpoint.safe_psql("CREATE EXTENSION neon_test_utils") @@ -62,7 +61,6 @@ def test_clog_truncate(neon_simple_env: NeonEnv): "test_clog_truncate_new", "test_clog_truncate", ancestor_start_lsn=lsn_after_truncation ) endpoint2 = env.endpoints.create_start("test_clog_truncate_new") - log.info("postgres is running on test_clog_truncate_new branch") # check that new node doesn't contain truncated segment pg_xact_0000_path_new = os.path.join(endpoint2.pg_xact_dir_path(), "0000") diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py new file mode 100644 index 0000000000..49dcb9b86a --- /dev/null +++ b/test_runner/regress/test_compaction.py @@ -0,0 +1,259 @@ +import enum +import json +import os +from typing import Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, generate_uploads_and_deletions +from fixtures.pageserver.http import PageserverApiException +from fixtures.workload import Workload + +AGGRESIVE_COMPACTION_TENANT_CONF = { + # Disable gc and compaction. The test runs compaction manually. + "gc_period": "0s", + "compaction_period": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, +} + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): + """ + This is a smoke test that compaction kicks in. The workload repeatedly churns + a small number of rows and manually instructs the pageserver to run compaction + between iterations. 
At the end of the test validate that the average number of + layers visited to gather reconstruct data for a given key is within the empirically + observed bounds. + """ + + # Effectively disable the page cache to rely only on image layers + # to shorten reads. + neon_env_builder.pageserver_config_override = """ +page_cache_size=10 +""" + + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 100 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + log.info("Checking layer access metrics ...") + + layer_access_metric_names = [ + "pageserver_layers_visited_per_read_global_sum", + "pageserver_layers_visited_per_read_global_count", + "pageserver_layers_visited_per_read_global_bucket", + "pageserver_layers_visited_per_vectored_read_global_sum", + "pageserver_layers_visited_per_vectored_read_global_count", + "pageserver_layers_visited_per_vectored_read_global_bucket", + ] + + metrics = env.pageserver.http_client().get_metrics() + for name in layer_access_metric_names: + layer_access_metrics = metrics.query_all(name) + log.info(f"Got metrics: {layer_access_metrics}") + + non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") + non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") + if non_vectored_count.value != 0: + non_vectored_average = non_vectored_sum.value / non_vectored_count.value 
+ else: + non_vectored_average = 0 + vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + if vectored_count.value > 0: + assert vectored_sum.value > 0 + vectored_average = vectored_sum.value / vectored_count.value + else: + # special case: running local tests with default legacy configuration + assert vectored_sum.value == 0 + vectored_average = 0 + + log.info(f"{non_vectored_average=} {vectored_average=}") + + # The upper bound for average number of layer visits below (8) + # was chosen empirically for this workload. + assert non_vectored_average < 8 + assert vectored_average < 8 + + +# Stripe sizes in number of pages. +TINY_STRIPES = 16 +LARGE_STRIPES = 32768 + + +@pytest.mark.parametrize( + "shard_count,stripe_size", [(None, None), (4, TINY_STRIPES), (4, LARGE_STRIPES)] +) +def test_sharding_compaction( + neon_env_builder: NeonEnvBuilder, stripe_size: int, shard_count: Optional[int] +): + """ + Use small stripes, small layers, and small compaction thresholds to exercise how compaction + and image layer generation interacts with sharding. + + We are looking for bugs that might emerge from the way sharding uses sparse layer files that + only contain some of the keys in the key range covered by the layer, such as errors estimating + the size of layers that might result in too-small layer files. + """ + + compaction_target_size = 128 * 1024 + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{compaction_target_size}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. 
+ "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly: we want to exercise image layer creation in this test. + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": 0, + } + + neon_env_builder.num_pageservers = 1 if shard_count is None else shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=stripe_size, + ) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 10): + # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1, + # these should result in image layers each time we write some data into a shard, and also shards + # receiving less data hitting their "empty image layer" path (where they should skip writing the layer, + # rather than asserting) + workload.churn_rows(64) + + # Assert that we got some image layers: this is important because this test's purpose is to exercise the sharding changes + # to Timeline::create_image_layers, so if we weren't creating any image layers we wouldn't be doing our job.
+ shard_has_image_layers = [] + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + image_layer_sizes = {} + for layer in layer_map.historic_layers: + if layer.kind == "Image": + image_layer_sizes[layer.layer_file_name] = layer.layer_file_size + + # Pageserver should assert rather than emit an empty layer file, but double check here + assert layer.layer_file_size > 0 + + shard_has_image_layers.append(len(image_layer_sizes) > 1) + log.info(f"Shard {shard_id} image layer sizes: {json.dumps(image_layer_sizes, indent=2)}") + + if stripe_size == TINY_STRIPES: + # Checking the average size validates that our keyspace partitioning is properly respecting sharding: if + # it was not, we would tend to get undersized layers because the partitioning would overestimate the physical + # data in a keyrange. + # + # We only do this check with tiny stripes, because large stripes may not give all shards enough + # data to have statistically significant image layers + avg_size = sum(v for v in image_layer_sizes.values()) / len(image_layer_sizes) + log.info(f"Shard {shard_id} average image layer size: {avg_size}") + assert avg_size > compaction_target_size / 2 + + if stripe_size == TINY_STRIPES: + # Expect writes were scattered across all pageservers: they should all have compacted some image layers + assert all(shard_has_image_layers) + else: + # With large stripes, it is expected that most of our writes went to one pageserver, so we just require + # that at least one of them has some image layers. 
+ assert any(shard_has_image_layers) + + # Assert that everything is still readable + workload.validate() + + +class CompactionAlgorithm(str, enum.Enum): + LEGACY = "legacy" + TIERED = "tiered" + + +@pytest.mark.parametrize( + "compaction_algorithm", [CompactionAlgorithm.LEGACY, CompactionAlgorithm.TIERED] +) +def test_uploads_and_deletions( + neon_env_builder: NeonEnvBuilder, + compaction_algorithm: CompactionAlgorithm, +): + """ + :param compaction_algorithm: the compaction algorithm to use. + """ + + tenant_conf = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + "compaction_algorithm": json.dumps({"kind": compaction_algorithm.value}), + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + # TODO remove these allowed errors + # https://github.com/neondatabase/neon/issues/7707 + # https://github.com/neondatabase/neon/issues/7759 + allowed_errors = [ + ".*/checkpoint.*rename temporary file as correct path for.*", # EEXIST + ".*delta layer created with.*duplicate values.*", + ".*assertion failed: self.lsn_range.start <= lsn.*", + ".*HTTP request handler task panicked: task.*panicked.*", + ] + if compaction_algorithm == CompactionAlgorithm.TIERED: + env.pageserver.allowed_errors.extend(allowed_errors) + + try: + generate_uploads_and_deletions(env, pageserver=env.pageserver) + except PageserverApiException as e: + log.info(f"Obtained PageserverApiException: {e}") + + # The errors occur flakily and no 
error is ensured to occur, + # however at least one of them occurs. + if compaction_algorithm == CompactionAlgorithm.TIERED: + found_allowed_error = any(env.pageserver.log_contains(e) for e in allowed_errors) + if not found_allowed_error: + raise Exception("None of the allowed_errors occured in the log") diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index f9d6d0a934..65649e0c0a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -1,4 +1,5 @@ import os +import re import shutil import subprocess import tempfile @@ -7,11 +8,14 @@ from typing import List, Optional import pytest import toml +from fixtures.common_types import Lsn +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, ) +from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, @@ -19,7 +23,6 @@ from fixtures.pageserver.utils import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. @@ -110,11 +113,6 @@ def test_create_snapshot( env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") - # FIXME: Is this expected? 
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - pg_bin.run_capture(["pgbench", "--initialize", "--scale=10", endpoint.connstr()]) pg_bin.run_capture(["pgbench", "--time=60", "--progress=2", endpoint.connstr()]) pg_bin.run_capture( @@ -136,6 +134,7 @@ def test_create_snapshot( for sk in env.safekeepers: sk.stop() env.pageserver.stop() + env.storage_controller.stop() # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it compatibility_snapshot_dir = ( @@ -143,7 +142,12 @@ def test_create_snapshot( ) if compatibility_snapshot_dir.exists(): shutil.rmtree(compatibility_snapshot_dir) - shutil.copytree(test_output_dir, compatibility_snapshot_dir) + + shutil.copytree( + test_output_dir, + compatibility_snapshot_dir, + ignore=shutil.ignore_patterns("pg_dynshmem"), + ) @check_ondisk_data_compatibility_if_enabled @@ -223,20 +227,53 @@ def test_forward_compatibility( ) try: + # Previous version neon_local and pageserver are not aware + # of the new config. + # TODO: remove these once the previous version of neon local supports them + neon_env_builder.pageserver_get_impl = None + neon_env_builder.pageserver_validate_vectored_get = None + neon_env_builder.num_safekeepers = 3 + + # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). + # But always use the current version's neon_local binary. + # This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. 
+ neon_env_builder.neon_binpath = compatibility_neon_bin + neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir + neon_env_builder.neon_local_binpath = neon_env_builder.neon_local_binpath + env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", - neon_binpath=compatibility_neon_bin, - pg_distrib_dir=compatibility_postgres_distrib_dir, ) + + # not using env.pageserver.version because it was initialized before + prev_pageserver_version_str = env.get_binary_version("pageserver") + prev_pageserver_version_match = re.search( + "Neon page server git-env:(.*) failpoints: (.*), features: (.*)", + prev_pageserver_version_str, + ) + if prev_pageserver_version_match is not None: + prev_pageserver_version = prev_pageserver_version_match.group(1) + else: + raise AssertionError( + "cannot find git hash in the version string: " + prev_pageserver_version_str + ) + + # does not include logs from previous runs + assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) + neon_env_builder.start() + # ensure the specified pageserver is running + assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) + check_neon_works( env, test_output_dir=test_output_dir, sql_dump_path=compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) + except Exception: if breaking_changes_allowed: pytest.xfail( @@ -250,9 +287,10 @@ def test_forward_compatibility( def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): ep = env.endpoints.create_start("main") + connstr = ep.connstr() + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) - connstr = ep.connstr() pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"] ) @@ -269,14 +307,23 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r timeline_id = env.initial_timeline pg_version = env.pg_version - # Delete all files from local_fs_remote_storage 
except initdb.tar.zst, + # Stop endpoint while we recreate timeline + ep.stop() + + try: + pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id) + except PageserverApiException as e: + # Allow the error as we might be running the old pageserver binary + log.info(f"Got allowed error: '{e}'") + + # Delete all files from local_fs_remote_storage except initdb-preserved.tar.zst, # the file is required for `timeline_create` with `existing_initdb_timeline_id`. # # TODO: switch to Path.walk() in Python 3.12 # for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk(): for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"): for filename in filenames: - if filename != "initdb.tar.zst": + if filename != "initdb-preserved.tar.zst" and filename != "initdb.tar.zst": (Path(dirpath) / filename).unlink() timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id) @@ -287,6 +334,9 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r existing_initdb_timeline_id=timeline_id, ) + # Timeline exists again: restart the endpoint + ep.start() + pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] ) diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py new file mode 100644 index 0000000000..dd36190fcd --- /dev/null +++ b/test_runner/regress/test_compute_catalog.py @@ -0,0 +1,34 @@ +import requests +from fixtures.neon_fixtures import NeonEnv + + +def test_compute_catalog(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_config", "empty") + + endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) + client = endpoint.http_client() + + objects = client.dbs_and_roles() + + # Assert that 'cloud_admin' role exists in the 'roles' list + assert any( + role["name"] == "cloud_admin" for role in objects["roles"] + 
), "The 'cloud_admin' role is missing" + + # Assert that 'postgres' database exists in the 'databases' list + assert any( + db["name"] == "postgres" for db in objects["databases"] + ), "The 'postgres' database is missing" + + ddl = client.database_schema(database="postgres") + + assert "-- PostgreSQL database dump" in ddl + + try: + client.database_schema(database="nonexistentdb") + raise AssertionError("Expected HTTPError was not raised") + except requests.exceptions.HTTPError as e: + assert ( + e.response.status_code == 404 + ), f"Expected 404 status code, but got {e.response.status_code}" diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index 0ea5784b67..4bb7df1e6a 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -1,6 +1,5 @@ from contextlib import closing -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv @@ -13,7 +12,6 @@ def test_config(neon_simple_env: NeonEnv): # change config endpoint = env.endpoints.create_start("test_config", config_lines=["log_min_messages=debug1"]) - log.info("postgres is running on test_config branch") with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 01ecc2b95f..30f8d81890 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -19,6 +19,12 @@ from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env = neon_env_builder.init_start() env.neon_cli.create_branch("test_crafted_wal_end") + env.pageserver.allowed_errors.extend( + [ + # seems like pageserver stop triggers these + ".*initial size calculation failed.*Bad state (not active).*", + ] + ) endpoint = env.endpoints.create("test_crafted_wal_end") wal_craft = WalCraft(env) diff --git 
a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index 500d19cf31..f741a9fc87 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -20,7 +20,6 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): env.neon_cli.create_branch("test_createdb", "empty") endpoint = env.endpoints.create_start("test_createdb") - log.info("postgres is running on 'test_createdb' branch") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch @@ -65,7 +64,6 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): env = neon_simple_env env.neon_cli.create_branch("test_dropdb", "empty") endpoint = env.endpoints.create_start("test_dropdb") - log.info("postgres is running on 'test_dropdb' branch") with endpoint.cursor() as cur: cur.execute("CREATE DATABASE foodb") diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index f1bc405287..17d9824f52 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.utils import query_scalar @@ -10,7 +9,6 @@ def test_createuser(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_createuser", "empty") endpoint = env.endpoints.create_start("test_createuser") - log.info("postgres is running on 'test_createuser' branch") with endpoint.cursor() as cur: # Cause a 'relmapper' change in the original branch diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 01aeb88bca..50da673d87 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -248,8 +248,15 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): # We don't have compute_ctl, so here, so create neon_superuser here manually cur.execute("CREATE ROLE neon_superuser 
NOLOGIN CREATEDB CREATEROLE") - with pytest.raises(psycopg2.InternalError): - cur.execute("ALTER ROLE neon_superuser LOGIN") + # Contrary to popular belief, being superman does not make you superuser + cur.execute("CREATE ROLE superman LOGIN NOSUPERUSER PASSWORD 'jungle_man'") + + with ddl.pg.cursor(user="superman", password="jungle_man") as superman_cur: + # We allow real SUPERUSERs to ALTER neon_superuser + with pytest.raises(psycopg2.InternalError): + superman_cur.execute("ALTER ROLE neon_superuser LOGIN") + + cur.execute("ALTER ROLE neon_superuser LOGIN") with pytest.raises(psycopg2.InternalError): cur.execute("CREATE DATABASE trololobus WITH OWNER neon_superuser") @@ -289,7 +296,6 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): # Some non-existent url config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"], ) - log.info("postgres is running on 'test_ddl_forwarding_invalid_db' branch") with endpoint.cursor() as cur: cur.execute("SET neon.forward_ddl = false") diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 9fdc4d59f5..7722828c79 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -1,22 +1,23 @@ import enum import time +from collections import Counter from dataclasses import dataclass -from typing import Any, Dict, Tuple +from typing import Any, Dict, Iterable, Tuple import pytest -import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PgBin, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import wait_until +from 
fixtures.utils import human_bytes, wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -43,17 +44,16 @@ def test_min_resident_size_override_handling( ps_http.set_tenant_config(tenant_id, {}) assert_config(tenant_id, None, default_tenant_conf_value) - env.pageserver.stop() if config_level_override is not None: - env.pageserver.start( - overrides=( - "--pageserver-config-override=tenant_config={ min_resident_size_override = " - + str(config_level_override) - + " }", - ) - ) - else: - env.pageserver.start() + + def set_min_resident_size(config): + tenant_config = config.get("tenant_config", {}) + tenant_config["min_resident_size_override"] = config_level_override + config["tenant_config"] = tenant_config + + env.pageserver.edit_config_toml(set_min_resident_size) + env.pageserver.stop() + env.pageserver.start() tenant_id, _ = env.neon_cli.create_tenant() assert_overrides(tenant_id, config_level_override) @@ -75,9 +75,15 @@ class EvictionOrder(str, enum.Enum): if self == EvictionOrder.ABSOLUTE_ORDER: return {"type": "AbsoluteAccessed"} elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: - return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}} + return { + "type": "RelativeAccessed", + "args": {"highest_layer_count_loses_first": False}, + } elif self == EvictionOrder.RELATIVE_ORDER_SPARE: - return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}} + return { + "type": "RelativeAccessed", + "args": {"highest_layer_count_loses_first": True}, + } else: raise RuntimeError(f"not implemented: {self}") @@ -91,17 +97,30 @@ class EvictionEnv: layer_size: int pgbench_init_lsns: Dict[TenantId, Lsn] - def timelines_du(self) -> Tuple[int, int, int]: + @property + def pageserver(self): + """ + Shortcut for tests that only use one pageserver. 
+ """ + return self.neon_env.pageserver + + def timelines_du(self, pageserver: NeonPageserver) -> Tuple[int, int, int]: return poor_mans_du( - self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False + self.neon_env, + [(tid, tlid) for tid, tlid in self.timelines], + pageserver, + verbose=False, ) - def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]: + def du_by_timeline(self, pageserver: NeonPageserver) -> Dict[Tuple[TenantId, TimelineId], int]: return { - (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0] + (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], pageserver, verbose=True)[0] for tid, tlid in self.timelines } + def count_layers_per_tenant(self, pageserver: NeonPageserver) -> Dict[TenantId, int]: + return count_layers_per_tenant(pageserver, self.timelines) + def warm_up_tenant(self, tenant_id: TenantId): """ Start a read-only compute at the LSN after pgbench -i, and run pgbench -S against it. @@ -126,72 +145,98 @@ class EvictionEnv: _avg = cur.fetchone() def pageserver_start_with_disk_usage_eviction( - self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder + self, + pageserver: NeonPageserver, + period, + max_usage_pct, + min_avail_bytes, + mock_behavior, + eviction_order: EvictionOrder, ): - disk_usage_config = { - "period": period, - "max_usage_pct": max_usage_pct, - "min_avail_bytes": min_avail_bytes, - "mock_statvfs": mock_behavior, - "eviction_order": eviction_order.config(), - } + """ + Starts pageserver up with mocked statvfs setup. The startup is + problematic because of dueling initial logical size calculations + requiring layers and disk usage based task evicting. - enc = toml.TomlEncoder() + Returns after initial logical sizes are complete, but the phase of disk + usage eviction task is unknown; it might need to run one more iteration + before assertions can be made. 
+ """ - self.neon_env.pageserver.start( - overrides=( - "--pageserver-config-override=disk_usage_based_eviction=" - + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + # these can sometimes happen during startup before any tenants have been + # loaded, so nothing can be evicted, we just wait for next iteration which + # is able to evict. + pageserver.allowed_errors.append(".*WARN.* disk usage still high.*") + + pageserver.patch_config_toml_nonrecursive( + { + "disk_usage_based_eviction": { + "period": period, + "max_usage_pct": max_usage_pct, + "min_avail_bytes": min_avail_bytes, + "mock_statvfs": mock_behavior, + "eviction_order": eviction_order.config(), + }, # Disk usage based eviction runs as a background task. # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup. # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages. # But, we only have a 10-second-timeout in this test. # So, disable the delay for this test. - "--pageserver-config-override=background_task_maximum_delay='0s'", - ), + "background_task_maximum_delay": "0s", + } ) - def statvfs_called(): - assert self.neon_env.pageserver.log_contains(".*running mocked statvfs.*") + pageserver.start() + # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction + for tenant_id, timeline_id in self.timelines: + tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id) + # Pageserver may be none if we are currently not attached anywhere, e.g. 
during secondary eviction test + if tenant_ps is not None: + tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id) + + def statvfs_called(): + pageserver.assert_log_contains(".*running mocked statvfs.*") + + # we most likely have already completed multiple runs wait_until(10, 1, statvfs_called) - # these can sometimes happen during startup before any tenants have been - # loaded, so nothing can be evicted, we just wait for next iteration which - # is able to evict. - self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*") + +def count_layers_per_tenant( + pageserver: NeonPageserver, timelines: Iterable[Tuple[TenantId, TimelineId]] +) -> Dict[TenantId, int]: + ret: Counter[TenantId] = Counter() + + for tenant_id, timeline_id in timelines: + timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id) + assert timeline_dir.exists() + for file in timeline_dir.iterdir(): + if "__" not in file.name: + continue + ret[tenant_id] += 1 + + return dict(ret) -def human_bytes(amt: float) -> str: - suffixes = ["", "Ki", "Mi", "Gi"] - - last = suffixes[-1] - - for name in suffixes: - if amt < 1024 or name == last: - return f"{int(round(amt))} {name}B" - amt = amt / 1024 - - raise RuntimeError("unreachable") - - -@pytest.fixture -def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: +def _eviction_env( + request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, num_pageservers: int +) -> EvictionEnv: """ Creates two tenants, one somewhat larger than the other. 
""" log.info(f"setting up eviction_env for test {request.node.name}") + neon_env_builder.num_pageservers = num_pageservers neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) # initial tenant will not be present on this pageserver env = neon_env_builder.init_configs() env.start() - pageserver_http = env.pageserver.http_client() # allow because we are invoking this manually; we always warn on executing disk based eviction - env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*") + for ps in env.pageservers: + ps.allowed_errors.append(r".* running disk usage based eviction due to pressure.*") # Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers. # Large count of layers and small layer size is good for testing because it makes evictions predictable. @@ -204,21 +249,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev timelines = [] for scale in pgbench_scales: - tenant_id, timeline_id = env.neon_cli.create_tenant( - conf={ - "gc_period": "0s", - "compaction_period": "0s", - "checkpoint_distance": f"{layer_size}", - "image_creation_threshold": "100", - "compaction_target_size": f"{layer_size}", - } - ) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - timelines.append((tenant_id, timeline_id)) + timelines.append(pgbench_init_tenant(layer_size, scale, env, pg_bin)) # stop the safekeepers to avoid on-demand downloads caused by # initial logical size calculation triggered by walreceiver connection status @@ -227,23 +258,13 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev # after stopping the safekeepers, we know that no new WAL will be coming in for tenant_id, timeline_id in timelines: - 
pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) - tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) - assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] - assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] - pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"]) - - layers = pageserver_http.layer_map_info(tenant_id, timeline_id) - log.info(f"{layers}") - assert ( - len(layers.historic_layers) >= 10 - ), "evictions happen at layer granularity, but we often assert at byte-granularity" + pgbench_init_lsns[tenant_id] = finish_tenant_creation(env, tenant_id, timeline_id, 10) eviction_env = EvictionEnv( timelines=timelines, neon_env=env, - pageserver_http=pageserver_http, + # this last tenant http client works for num_pageservers=1 + pageserver_http=env.get_tenant_pageserver(timelines[-1][0]).http_client(), layer_size=layer_size, pg_bin=pg_bin, pgbench_init_lsns=pgbench_init_lsns, @@ -252,6 +273,63 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev return eviction_env +def pgbench_init_tenant( + layer_size: int, scale: int, env: NeonEnv, pg_bin: PgBin +) -> Tuple[TenantId, TimelineId]: + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{layer_size}", + "image_creation_threshold": "999999", + "compaction_target_size": f"{layer_size}", + } + ) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()]) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + return (tenant_id, timeline_id) + + +def finish_tenant_creation( + env: NeonEnv, + tenant_id: TenantId, + timeline_id: TimelineId, + min_expected_layers: int, +) -> Lsn: + pageserver_http = env.get_tenant_pageserver(tenant_id).http_client() + 
pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) + tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) + assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] + assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] + pgbench_init_lsn = Lsn(tl_info["last_record_lsn"]) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + # log.info(f"{layers}") + assert ( + len(layers.historic_layers) >= min_expected_layers + ), "evictions happen at layer granularity, but we often assert at byte-granularity" + + return pgbench_init_lsn + + +@pytest.fixture +def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: + return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=1) + + +@pytest.fixture +def eviction_env_ha(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: + """ + Variant of the eviction environment with two pageservers for testing eviction on + HA configurations with a secondary location. 
+ """ + return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=2) + + def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): env = eviction_env @@ -264,10 +342,16 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): healthy_tenant_id, healthy_timeline_id = env.timelines[1] broken_size_pre, _, _ = poor_mans_du( - env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True + env.neon_env, + [(broken_tenant_id, broken_timeline_id)], + env.pageserver, + verbose=True, ) healthy_size_pre, _, _ = poor_mans_du( - env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True + env.neon_env, + [(healthy_tenant_id, healthy_timeline_id)], + env.pageserver, + verbose=True, ) # try to evict everything, then validate that broken tenant wasn't touched @@ -277,10 +361,16 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): log.info(f"{response}") broken_size_post, _, _ = poor_mans_du( - env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True + env.neon_env, + [(broken_tenant_id, broken_timeline_id)], + env.pageserver, + verbose=True, ) healthy_size_post, _, _ = poor_mans_du( - env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True + env.neon_env, + [(healthy_tenant_id, healthy_timeline_id)], + env.pageserver, + verbose=True, ) assert broken_size_pre == broken_size_post, "broken tenant should not be touched" @@ -302,7 +392,7 @@ def test_pageserver_evicts_until_pressure_is_relieved( env = eviction_env pageserver_http = env.pageserver_http - (total_on_disk, _, _) = env.timelines_du() + (total_on_disk, _, _) = env.timelines_du(env.pageserver) target = total_on_disk // 2 @@ -311,7 +401,7 @@ def test_pageserver_evicts_until_pressure_is_relieved( ) log.info(f"{response}") - (later_total_on_disk, _, _) = env.timelines_du() + (later_total_on_disk, _, _) = env.timelines_du(env.pageserver) actual_change = total_on_disk - later_total_on_disk @@ -336,8 +426,8 @@ def 
test_pageserver_respects_overridden_resident_size( env = eviction_env ps_http = env.pageserver_http - (total_on_disk, _, _) = env.timelines_du() - du_by_timeline = env.du_by_timeline() + (total_on_disk, _, _) = env.timelines_du(env.pageserver) + du_by_timeline = env.du_by_timeline(env.pageserver) log.info("du_by_timeline: %s", du_by_timeline) assert len(du_by_timeline) == 2, "this test assumes two tenants" @@ -379,8 +469,8 @@ def test_pageserver_respects_overridden_resident_size( GLOBAL_LRU_LOG_LINE, ), "this test is pointless if it fell back to global LRU" - (later_total_on_disk, _, _) = env.timelines_du() - later_du_by_timeline = env.du_by_timeline() + (later_total_on_disk, _, _) = env.timelines_du(env.pageserver) + later_du_by_timeline = env.du_by_timeline(env.pageserver) log.info("later_du_by_timeline: %s", later_du_by_timeline) actual_change = total_on_disk - later_total_on_disk @@ -412,7 +502,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E env = eviction_env ps_http = env.pageserver_http - (total_on_disk, _, _) = env.timelines_du() + (total_on_disk, _, _) = env.timelines_du(env.pageserver) target = total_on_disk response = ps_http.disk_usage_eviction_run( @@ -420,13 +510,13 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E ) log.info(f"{response}") - (later_total_on_disk, _, _) = env.timelines_du() + (later_total_on_disk, _, _) = env.timelines_du(env.pageserver) actual_change = total_on_disk - later_total_on_disk assert 0 <= actual_change, "nothing can load layers during this test" assert actual_change >= target, "eviction must always evict more than target" time.sleep(1) # give log time to flush - assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE) + env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE) env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) @@ -448,8 +538,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: 
EvictionOrder): env = eviction_env ps_http = env.pageserver_http - (total_on_disk, _, _) = env.timelines_du() - du_by_timeline = env.du_by_timeline() + (total_on_disk, _, _) = env.timelines_du(env.pageserver) + du_by_timeline = env.du_by_timeline(env.pageserver) + tenant_layers = env.count_layers_per_tenant(env.pageserver) # pick smaller or greater (iteration order is insertion order of scale=4 and scale=6) [warm, cold] = list(du_by_timeline.keys()) @@ -467,12 +558,12 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): ) log.info(f"{response}") - (later_total_on_disk, _, _) = env.timelines_du() + (later_total_on_disk, _, _) = env.timelines_du(env.pageserver) actual_change = total_on_disk - later_total_on_disk assert 0 <= actual_change, "nothing can load layers during this test" assert actual_change >= target, "eviction must always evict more than target" - later_du_by_timeline = env.du_by_timeline() + later_du_by_timeline = env.du_by_timeline(env.pageserver) for tenant, later_tenant_usage in later_du_by_timeline.items(): assert ( later_tenant_usage < du_by_timeline[tenant] @@ -503,12 +594,111 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): cold_size < cold_upper ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" else: - # just go with the space was freed, find proper limits later - pass + # with relative order what matters is the amount of layers, with a + # fudge factor of whether the eviction bothers tenants with highest + # layer count the most. last accessed times between tenants does not + # matter. 
+ layers_now = env.count_layers_per_tenant(env.pageserver) + + expected_ratio = later_total_on_disk / total_on_disk + log.info( + f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" + ) + + for tenant_id, original_count in tenant_layers.items(): + count_now = layers_now[tenant_id] + ratio = count_now / original_count + abs_diff = abs(ratio - expected_ratio) + assert original_count > count_now + + expectation = 0.06 + log.info( + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" + ) + # in this test case both relative_spare and relative_equal produce + # the same outcomes; this must be a quantization effect of similar + # sizes (-s4 and -s6) and small (5MB) layer size. + # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 + assert abs_diff < expectation + + +@pytest.mark.parametrize( + "order", + [ + EvictionOrder.ABSOLUTE_ORDER, + EvictionOrder.RELATIVE_ORDER_EQUAL, + EvictionOrder.RELATIVE_ORDER_SPARE, + ], +) +def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, order: EvictionOrder): + """ + Create in order first smaller tenants and finally a single larger tenant. + Assert that with relative order modes, the disk usage based eviction is + more fair towards the smaller tenants. 
+ """ + env = neon_env_builder.init_configs() + env.start() + env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*") + + # initial_tenant and initial_timeline do not exist + + # create N tenants the same fashion as EvictionEnv + layer_size = 5 * 1024**2 + timelines = [] + for scale in [1, 1, 1, 4]: + timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale)) + + env.neon_cli.safekeeper_stop() + + for (tenant_id, timeline_id), scale in timelines: + min_expected_layers = 4 if scale == 1 else 10 + finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers) + + tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines)) + (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, True) + + response = env.pageserver.http_client().disk_usage_eviction_run( + {"evict_bytes": total_on_disk // 5, "eviction_order": order.config()} + ) + log.info(f"{response}") + + after_tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines)) + + ratios = [] + for i, ((tenant_id, _timeline_id), _scale) in enumerate(timelines): + # we expect the oldest to suffer most + originally, after = tenant_layers[tenant_id], after_tenant_layers[tenant_id] + log.info(f"{i + 1}th tenant went from {originally} -> {after}") + ratio = after / originally + ratios.append(ratio) + + assert ( + len(ratios) == 4 + ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" + log.info(f"{ratios}") + + if order == EvictionOrder.ABSOLUTE_ORDER: + # first tenant loses most + assert ratios[0] <= ratios[1], "first should lose the most" + assert ratios[1] < ratios[2], "second should lose some" + assert ratios[1] < 1.0 + assert ratios[2] <= ratios[3], "third might not lose" + assert ratios[3] == 1.0, "tenant created last does not lose" + elif order == EvictionOrder.RELATIVE_ORDER_EQUAL: + assert all([x for x in ratios if x < 1.0]), "all 
tenants lose layers" + elif order == EvictionOrder.RELATIVE_ORDER_SPARE: + # with different layer sizes and pg versions, there are different combinations + assert len([x for x in ratios if x < 1.0]) >= 2, "require 2..4 tenants to lose layers" + assert ratios[3] < 1.0, "largest tenant always loses layers" + else: + raise RuntimeError(f"unimplemented {order}") def poor_mans_du( - env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False + env: NeonEnv, + timelines: Iterable[Tuple[TenantId, TimelineId]], + pageserver: NeonPageserver, + verbose: bool = False, ) -> Tuple[int, int, int]: """ Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples; @@ -518,7 +708,7 @@ def poor_mans_du( largest_layer = 0 smallest_layer = None for tenant_id, timeline_id in timelines: - timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id) + timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id) assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}" total = 0 for file in timeline_dir.iterdir(): @@ -549,6 +739,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): env = eviction_env env.neon_env.pageserver.stop() env.pageserver_start_with_disk_usage_eviction( + env.pageserver, period="1s", max_usage_pct=90, min_avail_bytes=0, @@ -559,7 +750,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") + env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO") @@ -573,11 +764,12 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): env.neon_env.pageserver.stop() # make it seem like we're at 100% utilization by setting total bytes to the used bytes - total_size, _, _ = env.timelines_du() + total_size, _, _ = env.timelines_du(env.pageserver) blocksize = 512 total_blocks = 
(total_size + (blocksize - 1)) // blocksize env.pageserver_start_with_disk_usage_eviction( + env.pageserver, period="1s", max_usage_pct=33, min_avail_bytes=0, @@ -592,14 +784,15 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) - wait_until(10, 1, relieved_log_message) + def less_than_max_usage_pct(): + post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" - post_eviction_total_size, _, _ = env.timelines_du() - - assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage" + wait_until(2, 2, less_than_max_usage_pct) def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): @@ -612,13 +805,14 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): env.neon_env.pageserver.stop() # make it seem like we're at 100% utilization by setting total bytes to the used bytes - total_size, _, _ = env.timelines_du() + total_size, _, _ = env.timelines_du(env.pageserver) blocksize = 512 total_blocks = (total_size + (blocksize - 1)) // blocksize min_avail_bytes = total_size // 3 env.pageserver_start_with_disk_usage_eviction( + env.pageserver, period="1s", max_usage_pct=100, min_avail_bytes=min_avail_bytes, @@ -633,13 +827,61 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): eviction_order=EvictionOrder.ABSOLUTE_ORDER, ) - def relieved_log_message(): - assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + wait_until( + 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + ) - wait_until(10, 1, relieved_log_message) + def more_than_min_avail_bytes_freed(): + 
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) + assert ( + total_size - post_eviction_total_size >= min_avail_bytes + ), f"we requested at least {min_avail_bytes} worth of free space" - post_eviction_total_size, _, _ = env.timelines_du() + wait_until(2, 2, more_than_min_avail_bytes_freed) + + +def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): + env = eviction_env_ha + + tenant_ids = [t[0] for t in env.timelines] + + # Set up a situation where one pageserver _only_ has secondary locations on it, + # so that when we release space we are sure it is via secondary locations. + log.info("Setting up secondary locations...") + ps_secondary = env.neon_env.pageservers[1] + for tenant_id in tenant_ids: + # Find where it is attached + pageserver = env.neon_env.get_tenant_pageserver(tenant_id) + pageserver.http_client().tenant_heatmap_upload(tenant_id) + + # Detach it + pageserver.tenant_detach(tenant_id) + + # Create a secondary mode location for the tenant, all tenants on one pageserver that will only + # contain secondary locations: this is the one where we will exercise disk usage eviction + ps_secondary.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + readback_conf = ps_secondary.read_tenant_location_conf(tenant_id) + log.info(f"Read back conf: {readback_conf}") + + # Request secondary location to download all layers that the attached location indicated + # in its heatmap + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + total_size, _, _ = env.timelines_du(ps_secondary) + evict_bytes = total_size // 3 + + response = ps_secondary.http_client().disk_usage_eviction_run({"evict_bytes": evict_bytes}) + log.info(f"{response}") + + post_eviction_total_size, _, _ = env.timelines_du(ps_secondary) assert ( - total_size - post_eviction_total_size >= min_avail_bytes - ), "we requested at least min_avail_bytes worth of free space" + total_size - 
post_eviction_total_size >= evict_bytes + ), "we requested at least evict_bytes worth of free space" diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py deleted file mode 100644 index 224e6f50c7..0000000000 --- a/test_runner/regress/test_duplicate_layers.py +++ /dev/null @@ -1,147 +0,0 @@ -import time - -import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn -from fixtures.pageserver.utils import ( - wait_for_last_record_lsn, - wait_for_upload_queue_empty, - wait_until_tenant_active, -) -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from requests.exceptions import ConnectionError - - -def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() - - # use a failpoint to return all L0s as L1s - message = ".*duplicated L1 layer layer=.*" - env.pageserver.allowed_errors.append(message) - - # Use aggressive compaction and checkpoint settings - tenant_id, _ = env.neon_cli.create_tenant( - conf={ - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", - "compaction_period": "5 s", - "compaction_threshold": "3", - } - ) - - pageserver_http.configure_failpoints(("compact-level0-phase1-return-same", "return")) - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) - - time.sleep(10) # let compaction to be performed - assert env.pageserver.log_contains("compact-level0-phase1-return-same") - - -def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - """ - Test sets fail point at the end of first compaction phase: after - flushing new L1 layer but before deletion of L0 layers. 
- - The L1 used to be overwritten, but with crash-consistency via remote - index_part.json, we end up deleting the not yet uploaded L1 layer on - startup. - """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start( - initial_tenant_conf={ - "checkpoint_distance": f"{1024 ** 2}", - "compaction_target_size": f"{1024 ** 2}", - "compaction_period": "0 s", - "compaction_threshold": "3", - } - ) - pageserver_http = env.pageserver.http_client() - - tenant_id, timeline_id = env.initial_tenant, env.initial_timeline - - pageserver_http.configure_failpoints(("after-timeline-compacted-first-L1", "exit")) - - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) - - lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - endpoint.stop() - - # make sure we receive no new wal after this, so that we'll write over the same L1 file. 
- endpoint.stop() - for sk in env.safekeepers: - sk.stop() - - # hit the exit failpoint - with pytest.raises(ConnectionError, match="Remote end closed connection without response"): - pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - env.pageserver.stop() - - # now the duplicate L1 has been created, but is not yet uploaded - assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - - # path = env.remote_storage.timeline_path(tenant_id, timeline_id) - l1_found = None - for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir(): - if path.name == "metadata" or path.name.startswith("ephemeral-"): - continue - - if len(path.suffixes) > 0: - # temp files - continue - - [key_range, lsn_range] = path.name.split("__", maxsplit=1) - - if "-" not in lsn_range: - # image layer - continue - - [key_start, key_end] = key_range.split("-", maxsplit=1) - - if key_start == "0" * 36 and key_end == "F" * 36: - # L0 - continue - - if l1_found is not None: - raise RuntimeError(f"found multiple L1: {l1_found.name} and {path.name}") - l1_found = path - - assert l1_found is not None, "failed to find L1 locally" - - uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name - ) - assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded" - - env.pageserver.start() - wait_until_tenant_active(pageserver_http, tenant_id) - - assert not l1_found.exists(), "partial compaction result should had been removed during startup" - - # wait for us to catch up again - wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) - - pageserver_http.timeline_compact(tenant_id, timeline_id) - - # give time for log flush - time.sleep(1) - - message = f".*duplicated L1 layer layer={l1_found.name}" - found_msg = env.pageserver.log_contains(message) - # resident or evicted, it should not be overwritten, however it should had been non-existing at startup - assert ( - found_msg is None - ), "layer should had 
been removed during startup, did it live on as evicted?" - - assert l1_found.exists(), "the L1 reappears" - - wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) - - uploaded = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, l1_found.name - ) - assert uploaded.exists(), "the L1 is uploaded" diff --git a/test_runner/regress/test_explain_with_lfc_stats.py b/test_runner/regress/test_explain_with_lfc_stats.py new file mode 100644 index 0000000000..5231dedcda --- /dev/null +++ b/test_runner/regress/test_explain_with_lfc_stats.py @@ -0,0 +1,84 @@ +from pathlib import Path + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv + + +def test_explain_with_lfc_stats(neon_simple_env: NeonEnv): + env = neon_simple_env + + cache_dir = Path(env.repo_dir) / "file_cache" + cache_dir.mkdir(exist_ok=True) + + branchname = "test_explain_with_lfc_stats" + env.neon_cli.create_branch(branchname, "empty") + log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + endpoint = env.endpoints.create_start( + branchname, + config_lines=[ + "shared_buffers='1MB'", + f"neon.file_cache_path='{cache_dir}/file.cache'", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], + ) + + cur = endpoint.connect().cursor() + + log.info(f"preparing some data in {endpoint.connstr()}") + + ddl = """ +CREATE TABLE pgbench_accounts ( + aid bigint NOT NULL, + bid integer, + abalance integer, + filler character(84), + -- more web-app like columns + text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5), + jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb +) +WITH (fillfactor='100'); +""" + + cur.execute(ddl) + cur.execute( + "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;" + ) + + log.info(f"warming up caches with 
sequential scan in {endpoint.connstr()}") + cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0") + + log.info("running explain analyze without LFC values to verify they do not show up in the plan") + cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0") + rows = cur.fetchall() + plan = "\n".join(r[0] for r in rows) + log.debug(plan) + assert "Seq Scan on pgbench_accounts" in plan + assert "Buffers: shared hit" in plan + assert "File cache: hits=" not in plan + log.info("running explain analyze WITH LFC values to verify they do now show up") + cur.execute( + "EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0" + ) + rows = cur.fetchall() + plan = "\n".join(r[0] for r in rows) + log.debug(plan) + assert "Seq Scan on pgbench_accounts" in plan + assert "Buffers: shared hit" in plan + assert "File cache: hits=" in plan + log.info("running explain analyze WITH LFC values to verify json output") + cur.execute( + "EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0" + ) + jsonplan = cur.fetchall()[0][0] + log.debug(jsonplan) + # Directly access the 'Plan' part of the first element of the JSON array + plan_details = jsonplan[0]["Plan"] + + # Extract "File Cache Hits" and "File Cache Misses" + file_cache_hits = plan_details.get("File Cache Hits") + file_cache_misses = plan_details.get("File Cache Misses") + + # Now you can assert the values + assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}" + assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}" diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index a456c06862..e6d51a77a6 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -1,6 +1,7 @@ import os from pathlib import Path +from fixtures.common_types import Lsn from 
fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -8,7 +9,6 @@ from fixtures.neon_fixtures import ( VanillaPostgres, ) from fixtures.port_distributor import PortDistributor -from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture num_rows = 1000 @@ -19,18 +19,16 @@ def test_fullbackup( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, port_distributor: PortDistributor, - pg_distrib_dir: Path, test_output_dir: Path, ): env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_fullbackup") - endpoint_main = env.endpoints.create_start("test_fullbackup") - log.info("postgres is running on 'test_fullbackup' branch") + # endpoint needs to be alive until the fullbackup so that we have + # prev_record_lsn for the vanilla_pg to start in read-write mode + # for some reason this does not happen if endpoint is shutdown. + endpoint_main = env.endpoints.create_start("main") with endpoint_main.cursor() as cur: - timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id")) - # data loading may take a while, so increase statement timeout cur.execute("SET statement_timeout='300s'") cur.execute( @@ -42,17 +40,13 @@ def test_fullbackup( lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) log.info(f"start_backup_lsn = {lsn}") - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
- psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Get and unpack fullbackup from pageserver restored_dir_path = env.repo_dir / "restored_datadir" os.mkdir(restored_dir_path, 0o750) - query = f"fullbackup {env.initial_tenant} {timeline} {lsn}" tar_output_file = test_output_dir / "fullbackup.tar" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, env.initial_timeline, lsn, tar_output_file + ) subprocess_capture( env.repo_dir, ["tar", "-xf", str(tar_output_file), "-C", str(restored_dir_path)] ) @@ -62,17 +56,11 @@ def test_fullbackup( # use resetwal to overwrite it pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") cmd = [pg_resetwal_path, "-D", str(restored_dir_path)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.run_capture(cmd) # Restore from the backup and find the data we inserted port = port_distributor.get_port() with VanillaPostgres(restored_dir_path, pg_bin, port, init=False) as vanilla_pg: - # TODO make port an optional argument - vanilla_pg.configure( - [ - f"port={port}", - ] - ) vanilla_pg.start() num_rows_found = vanilla_pg.safe_psql("select count(*) from tbl;", user="cloud_admin")[0][0] assert num_rows == num_rows_found diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index ef68049ee7..44133f2350 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -2,6 +2,7 @@ import asyncio import concurrent.futures import random +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,7 +11,6 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TimelineId # Test configuration # @@ -67,11 +67,9 @@ async def 
update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId): # def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) timeline = env.neon_cli.create_branch("test_gc_aggressive", "main") endpoint = env.endpoints.create_start("test_gc_aggressive") - log.info("postgres is running on test_gc_aggressive branch") with endpoint.cursor() as cur: # Create table, and insert the first 100 rows @@ -95,13 +93,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # def test_gc_index_upload(neon_env_builder: NeonEnvBuilder): - # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" num_index_uploads = 0 neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start() + # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main") endpoint = env.endpoints.create_start("test_gc_index_upload") diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py deleted file mode 100644 index 284a8c3563..0000000000 --- a/test_runner/regress/test_gc_cutoff.py +++ /dev/null @@ -1,47 +0,0 @@ -import subprocess - -import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin - - -# Test gc_cutoff -# -# This test sets fail point at the end of GC, and checks that pageserver -# normally restarts after it. 
Also, there should be GC ERRORs in the log, -# but the fixture checks the log for any unexpected ERRORs after every -# test anyway, so it doesn't need any special attention here. -@pytest.mark.timeout(600) -def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - env = neon_env_builder.init_start( - initial_tenant_conf={ - "gc_period": "10 s", - "gc_horizon": f"{1024 ** 2}", - "checkpoint_distance": f"{1024 ** 2}", - "compaction_period": "5 s", - # set PITR interval to be small, so we can do GC - "pitr_interval": "1 s", - "compaction_threshold": "3", - "image_creation_threshold": "2", - } - ) - - pageserver_http = env.pageserver.http_client() - - # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test - tenant_id = env.initial_tenant - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) - connstr = endpoint.connstr(options="-csynchronous_commit=off") - pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) - - pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit")) - - # Because this test does a rapid series of restarts of the same node, it's possible that - # we are restarted again before we can clean up deletion lists form the previous generation, - # resulting in a subsequent startup logging a warning. 
- env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*") - - for _ in range(5): - with pytest.raises(subprocess.SubprocessError): - pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr]) - env.pageserver.stop() - env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"}) diff --git a/test_runner/regress/test_gin_redo.py b/test_runner/regress/test_gin_redo.py new file mode 100644 index 0000000000..9205882239 --- /dev/null +++ b/test_runner/regress/test_gin_redo.py @@ -0,0 +1,22 @@ +import time + +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup + + +# +# Test that redo of XLOG_GIN_VACUUM_PAGE doesn't produce error +# +def test_gin_redo(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + time.sleep(1) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + con = primary.connect() + cur = con.cursor() + cur.execute("create table gin_test_tbl(id integer, i int4[])") + cur.execute("create index gin_test_idx on gin_test_tbl using gin (i)") + cur.execute("insert into gin_test_tbl select g,array[3, 1, g] from generate_series(1, 10000) g") + cur.execute("delete from gin_test_tbl where id % 2 = 0") + cur.execute("vacuum gin_test_tbl") + wait_replica_caughtup(primary, secondary) diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 7822e29ed9..8edc8c554c 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,38 +1,20 @@ +import asyncio import os -import re +import threading import time +from functools import partial +import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import Endpoint, NeonEnv - - -def wait_caughtup(primary: Endpoint, secondary: Endpoint): - primary_lsn = primary.safe_psql_scalar( - "SELECT 
pg_current_wal_insert_lsn()::text", log_query=False - ) - while True: - secondary_lsn = secondary.safe_psql_scalar( - "SELECT pg_last_wal_replay_lsn()", log_query=False - ) - caught_up = secondary_lsn >= primary_lsn - log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") - if caught_up: - return - time.sleep(1) - - -# Check for corrupted WAL messages which might otherwise go unnoticed if -# reconnection fixes this. -def scan_standby_log_for_errors(secondary): - log_path = secondary.endpoint_path() / "compute.log" - with log_path.open("r") as f: - markers = re.compile( - r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" - ) - for line in f: - if markers.search(line): - log.info(f"bad error in standby log: {line}") - raise AssertionError() +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + log_replica_lag, + tenant_get_shards, + wait_replica_caughtup, +) +from fixtures.utils import wait_until def test_hot_standby(neon_simple_env: NeonEnv): @@ -79,7 +61,7 @@ def test_hot_standby(neon_simple_env: NeonEnv): primary.safe_psql("create table t(key int, value text)") primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'") - wait_caughtup(primary, secondary) + wait_replica_caughtup(primary, secondary) with secondary.connect() as s_con: with s_con.cursor() as s_cur: @@ -94,8 +76,268 @@ def test_hot_standby(neon_simple_env: NeonEnv): assert response is not None assert response == responses[query] - scan_standby_log_for_errors(secondary) + # Check for corrupted WAL messages which might otherwise go unnoticed if + # reconnection fixes this. 
+ assert not secondary.log_contains( + "incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" + ) # clean up if slow_down_send: sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off")) + + +def test_2_replicas_start(neon_simple_env: NeonEnv): + env = neon_simple_env + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + time.sleep(1) + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary1" + ) as secondary1: + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary2" + ) as secondary2: + wait_replica_caughtup(primary, secondary1) + wait_replica_caughtup(primary, secondary2) + + +# Test two different scenarios related to gc of data needed by hot standby. +# +# When pause_apply is False, standby is mostly caught up with the primary. +# However, in compute <-> pageserver protocol version 1 only one LSN had been +# sent to the pageserver in page request, and to avoid waits in the pageserver +# it was last-written LSN cache value. If page hasn't been updated for a long +# time that resulted in an error from the pageserver: "Bad request: tried to +# request a page version that was garbage collected". For primary this wasn't a +# problem because pageserver always bumped LSN to the newest one; for standy +# that would be incorrect since we might get page fresher then apply LSN. Hence, +# in protocol version v2 two LSNs were introduced: main request_lsn (apply LSN +# in case of standby) and not_modified_since which could be used as an +# optimization to avoid waiting. +# +# https://github.com/neondatabase/neon/issues/6211 +# +# When pause_apply is True we model standby lagging behind primary (e.g. due to +# high max_standby_streaming_delay). 
To prevent pageserver from removing data +# still needed by the standby apply LSN is propagated in standby -> safekeepers +# -> broker -> pageserver flow so that pageserver could hold off gc for it. +@pytest.mark.parametrize("pause_apply", [False, True]) +def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool): + tenant_conf = { + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + # Protocol version 2 was introduced to fix the issue + # that this test exercises. With protocol version 1 it + # fails. + config_lines=["neon.protocol_version=2"], + ) as secondary: + p_cur = primary.connect().cursor() + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test (id int primary key) WITH (autovacuum_enabled=false)") + p_cur.execute("INSERT INTO test SELECT generate_series(1, 10000) AS g") + + wait_replica_caughtup(primary, secondary) + + s_cur = secondary.connect().cursor() + + s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") + res = s_cur.fetchone() + assert res is not None + + s_cur.execute("SELECT COUNT(*) FROM test") + res = s_cur.fetchone() + assert res[0] == 10000 + + # Clear the cache in the standby, so that when we + # re-execute the query, it will make GetPage + # requests. This does not clear the last-written LSN cache + # so we still remember the LSNs of the pages. + s_cur.execute("SELECT clear_buffer_cache()") + + if pause_apply: + s_cur.execute("SELECT pg_wal_replay_pause()") + + # Do other stuff on the primary, to advance the WAL + p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g") + + # Run GC. 
The PITR interval is very small, so this advances the GC cutoff LSN + # very close to the primary's current insert LSN. + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Re-execute the query. The GetPage requests that this + # generates use old not_modified_since LSNs, older than + # the GC cutoff, but new request LSNs. (In protocol + # version 1 there was only one LSN, and this failed.) + log_replica_lag(primary, secondary) + s_cur.execute("SELECT COUNT(*) FROM test") + log_replica_lag(primary, secondary) + res = s_cur.fetchone() + assert res[0] == 10000 + + +def run_pgbench(connstr: str, pg_bin: PgBin): + log.info(f"Start a pgbench workload on pg {connstr}") + # s10 is about 150MB of data. In debug mode init takes about 15s on SSD. + pg_bin.run_capture(["pgbench", "-i", "-s10", connstr]) + log.info("pgbench init done") + pg_bin.run_capture(["pgbench", "-T60", connstr]) + + +# assert that pgbench_accounts and its index are created. +def pgbench_accounts_initialized(ep): + ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass") + + +# Test that hot_standby_feedback works in neon (it is forwarded through +# safekeepers). That is, ensure queries on standby don't fail during load on +# primary under the following conditions: +# - pgbench bombards primary with updates. +# - On the secondary we run long select of the updated table. +# - Set small max_standby_streaming_delay: hs feedback should prevent conflicts +# so apply doesn't need to wait. +# - Do agressive vacuum on primary which still shouldn't create conflicts. +# Actually this appears to be redundant due to microvacuum existence. +# +# Without hs feedback enabled we'd see 'User query might have needed to see row +# versions that must be removed.' 
errors. +def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + agressive_vacuum_conf = [ + "log_autovacuum_min_duration = 0", + "autovacuum_naptime = 10s", + "autovacuum_vacuum_threshold = 25", + "autovacuum_vacuum_scale_factor = 0.1", + "autovacuum_vacuum_cost_delay = -1", + ] + with env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf + ) as primary: + # It would be great to have more strict max_standby_streaming_delay=0s here, but then sometimes it fails with + # 'User was holding shared buffer pin for too long.'. + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_standby_streaming_delay=2s", + "neon.protocol_version=2", + "hot_standby_feedback=true", + ], + ) as secondary: + log.info( + f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}" + ) + t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin)) + t.start() + # Wait until pgbench_accounts is created + filled on replica *and* + # index is created. Otherwise index creation would conflict with + # read queries and hs feedback won't save us. + wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) + + # Test should fail if hs feedback is disabled anyway, but cross + # check that walproposer sets some xmin. 
+ def xmin_is_not_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert int(slot_xmin) > 0 + + wait_until(10, 1.0, xmin_is_not_null) + for _ in range(1, 5): + # in debug mode takes about 5-7s + balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") + log.info(f"balance={balance}") + log_replica_lag(primary, secondary) + t.join() + + # check xmin is reset when standby is gone + def xmin_is_null(): + slot_xmin = primary.safe_psql_scalar( + "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'", + log_query=False, + ) + log.info(f"xmin is {slot_xmin}") + assert slot_xmin is None + + wait_until(10, 1.0, xmin_is_null) + + +# Test race condition between WAL replay and backends performing queries +# https://github.com/neondatabase/neon/issues/7791 +def test_replica_query_race(neon_simple_env: NeonEnv): + env = neon_simple_env + + primary_ep = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) + + with primary_ep.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("CREATE EXTENSION neon_test_utils") + p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter") + + standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby") + wait_replica_caughtup(primary_ep, standby_ep) + + # In primary, run a lot of UPDATEs on a single page + finished = False + writecounter = 1 + + async def primary_workload(): + nonlocal writecounter, finished + conn = await primary_ep.connect_async() + while writecounter < 10000: + writecounter += 1 + await conn.execute(f"UPDATE test SET counter = {writecounter}") + finished = True + + # In standby, at the same time, run queries on it. 
And repeatedly drop caches + async def standby_workload(): + nonlocal writecounter, finished + conn = await standby_ep.connect_async() + reads = 0 + while not finished: + readcounter = await conn.fetchval("SELECT counter FROM test") + + # Check that the replica is keeping up with the primary. In local + # testing, the lag between primary and standby is much smaller, in + # the ballpark of 2-3 counter values. But be generous in case there's + # some hiccup. + # assert(writecounter - readcounter < 1000) + assert readcounter <= writecounter + if reads % 100 == 0: + log.info(f"read {reads}: counter {readcounter}, last update {writecounter}") + reads += 1 + + await conn.execute("SELECT clear_buffer_cache()") + + async def both(): + await asyncio.gather( + primary_workload(), + standby_workload(), + ) + + asyncio.run(both()) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index faedf5d944..ac27a4cf36 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -7,6 +7,7 @@ from contextlib import closing from pathlib import Path import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -20,8 +21,7 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import subprocess_capture +from fixtures.utils import assert_pageserver_backups_equal, subprocess_capture def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder): @@ -90,21 +90,10 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build [ ".*error importing base backup .*", ".*Timeline got dropped without initializing, cleaning its files.*", - ".*Removing intermediate uninit mark file.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* 
not found.*", ".*InternalServerError.*Timeline .* not found.*", ".*InternalServerError.*Cannot delete timeline which has child timelines.*", - ".*ignored .* unexpected bytes after the tar archive.*", - ] - ) - - env.pageserver.allowed_errors.extend( - [ - # FIXME: we should clean up pageserver to not print this - ".*exited with error: unexpected message type: CopyData.*", - # FIXME: Is this expected? - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", ] ) @@ -142,12 +131,9 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build with pytest.raises(RuntimeError): import_tar(corrupt_base_tar, wal_tar) - # A tar with trailing garbage is currently accepted. It prints a warnings - # to the pageserver log, however. Check that. - import_tar(base_plus_garbage_tar, wal_tar) - assert env.pageserver.log_contains( - ".*WARN.*ignored .* unexpected bytes after the tar archive.*" - ) + # Importing a tar with trailing garbage fails + with pytest.raises(RuntimeError): + import_tar(base_plus_garbage_tar, wal_tar) client = env.pageserver.http_client() timeline_delete_wait_completed(client, tenant, timeline) @@ -163,6 +149,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) assert endpoint.safe_psql("select count(*) from t") == [(300000,)] + vanilla_pg.stop() + def test_import_from_pageserver_small( pg_bin: PgBin, neon_env_builder: NeonEnvBuilder, test_output_dir: Path @@ -170,17 +158,12 @@ def test_import_from_pageserver_small( neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - # FIXME: Is this expected? 
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - timeline = env.neon_cli.create_branch("test_import_from_pageserver_small") endpoint = env.endpoints.create_start("test_import_from_pageserver_small") num_rows = 3000 lsn = _generate_data(num_rows, endpoint) - _import(num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir) + _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir) @pytest.mark.timeout(1800) @@ -210,9 +193,7 @@ def test_import_from_pageserver_multisegment( log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB") assert logical_size > 1024**3 # = 1GB - tar_output_file = _import( - num_rows, lsn, env, pg_bin, timeline, env.pg_distrib_dir, test_output_dir - ) + tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline, test_output_dir) # Check if the backup data contains multiple segment files cnt_seg_files = 0 @@ -252,7 +233,6 @@ def _import( env: NeonEnv, pg_bin: PgBin, timeline: TimelineId, - pg_distrib_dir: Path, test_output_dir: Path, ) -> Path: """Test importing backup data to the pageserver. @@ -265,15 +245,9 @@ def _import( path to the backup archive file""" log.info(f"start_backup_lsn = {lsn}") - # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq. - # PgBin sets it automatically, but here we need to pipe psql output to the tar command. 
- psql_env = {"LD_LIBRARY_PATH": str(pg_distrib_dir / "lib")} - # Get a fullbackup from pageserver - query = f"fullbackup { env.initial_tenant} {timeline} {lsn}" tar_output_file = test_output_dir / "fullbackup.tar" - cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query, "-o", str(tar_output_file)] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup(env.pageserver, env.initial_tenant, timeline, lsn, tar_output_file) # Stop the first pageserver instance, erase all its data env.endpoints.stop_all() @@ -318,26 +292,15 @@ def _import( wait_for_upload(client, tenant, timeline, lsn) # Check it worked - endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant) + endpoint = env.endpoints.create_start(endpoint_id, tenant_id=tenant, lsn=lsn) assert endpoint.safe_psql("select count(*) from tbl") == [(expected_num_rows,)] # Take another fullbackup - query = f"fullbackup { tenant} {timeline} {lsn}" new_tar_output_file = test_output_dir / "fullbackup-new.tar" - cmd = [ - "psql", - "--no-psqlrc", - env.pageserver.connstr(), - "-c", - query, - "-o", - str(new_tar_output_file), - ] - pg_bin.run_capture(cmd, env=psql_env) + pg_bin.take_fullbackup(env.pageserver, tenant, timeline, lsn, new_tar_output_file) # Check it's the same as the first fullbackup - # TODO pageserver should be checking checksum - assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file) + assert_pageserver_backups_equal(tar_output_file, new_tar_output_file, set()) # Check that gc works pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py new file mode 100644 index 0000000000..44c77b3410 --- /dev/null +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -0,0 +1,151 @@ +from dataclasses import dataclass +from typing import Iterable, List, Union + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import 
NeonEnvBuilder, wait_for_last_flush_lsn +from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo +from fixtures.utils import human_bytes + + +def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str): + """ + Build a non-small GIN index which includes similarly batched up images in WAL stream as does pgvector + to show that we no longer create oversized layers. + """ + + if build_type == "debug": + pytest.skip("debug run is unnecessarily slow") + + minimum_initdb_size = 20 * 1024**2 + checkpoint_distance = 32 * 1024**2 + minimum_good_layer_size = checkpoint_distance * 0.9 + minimum_too_large_layer_size = 2 * checkpoint_distance + + # index size: 99MiB + rows = 2_500_000 + + # bucket lower limits + buckets = [0, minimum_initdb_size, minimum_good_layer_size, minimum_too_large_layer_size] + + assert ( + minimum_initdb_size < minimum_good_layer_size + ), "keep checkpoint_distance higher than the initdb size (find it by experimenting)" + + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": f"{checkpoint_distance}", + "compaction_target_size": f"{checkpoint_distance}", + # this test is primarly interested in L0 sizes but we'll compact after ingestion to ensure sizes are good even then + "compaction_period": "0s", + "gc_period": "0s", + "compaction_threshold": "255", + "image_creation_threshold": "99999", + } + ) + + # build a larger than 3*checkpoint_distance sized gin index. 
+ # gin index building exhibits the same behaviour as the pgvector with the two phase build + with env.endpoints.create_start("main") as ep, ep.cursor() as cur: + cur.execute( + f"create table int_array_test as select array_agg(g) as int_array from generate_series(1, {rows}) g group by g / 10;" + ) + cur.execute( + "create index int_array_test_gin_index on int_array_test using gin (int_array);" + ) + cur.execute("select pg_table_size('int_array_test_gin_index')") + size = cur.fetchone() + assert size is not None + assert isinstance(size[0], int) + log.info(f"gin index size: {human_bytes(size[0])}") + assert ( + size[0] > checkpoint_distance * 3 + ), f"gin index is not large enough: {human_bytes(size[0])}" + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + ps_http = env.pageserver.http_client() + ps_http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(infos.in_memory_layers) == 0, "should had flushed open layers" + post_ingest = histogram_historic_layers(infos, buckets) + + # describe first, assert later for easier debugging + log.info("non-cumulative layer size distribution after ingestion:") + print_layer_size_histogram(post_ingest) + + # since all we have are L0s, we should be getting nice L1s and images out of them now + ps_http.patch_tenant_config_client_side( + env.initial_tenant, + { + "compaction_threshold": 1, + "image_creation_threshold": 1, + }, + ) + + ps_http.timeline_compact(env.initial_tenant, env.initial_timeline, True, True) + + infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(infos.in_memory_layers) == 0, "no new inmem layers expected" + post_compact = histogram_historic_layers(infos, buckets) + + log.info("non-cumulative layer size distribution after compaction:") + print_layer_size_histogram(post_compact) + + assert ( + post_ingest.counts[3] == 0 + ), f"there should be no layers 
larger than 2*checkpoint_distance ({human_bytes(2*checkpoint_distance)})" + assert post_ingest.counts[1] == 1, "expect one smaller layer for initdb" + assert ( + post_ingest.counts[0] <= 1 + ), "expect at most one tiny layer from shutting down the endpoint" + + # just make sure we don't have trouble splitting the layers apart + assert post_compact.counts[3] == 0 + + +@dataclass +class Histogram: + buckets: List[Union[int, float]] + counts: List[int] + sums: List[int] + + +def histogram_historic_layers( + infos: LayerMapInfo, minimum_sizes: List[Union[int, float]] +) -> Histogram: + def log_layer(layer: HistoricLayerInfo) -> HistoricLayerInfo: + log.info( + f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)" + ) + return layer + + layers = map(log_layer, infos.historic_layers) + sizes = (x.layer_file_size for x in layers) + return histogram(sizes, minimum_sizes) + + +def histogram(sizes: Iterable[int], minimum_sizes: List[Union[int, float]]) -> Histogram: + assert all(minimum_sizes[i] < minimum_sizes[i + 1] for i in range(len(minimum_sizes) - 1)) + buckets = list(enumerate(minimum_sizes)) + counts = [0 for _ in buckets] + sums = [0 for _ in buckets] + + for size in sizes: + found = False + for index, min_size in reversed(buckets): + if size >= min_size: + counts[index] += 1 + sums[index] += size + found = True + break + assert found + + return Histogram(minimum_sizes, counts, sums) + + +def print_layer_size_histogram(h: Histogram): + for index, min_size in enumerate(h.buckets): + log.info( + f">= {human_bytes(min_size)}: {h.counts[index]} layers total {human_bytes(h.sums[index])}" + ) diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py index b6ac1aa41f..c5d5b5fe64 100644 --- a/test_runner/regress/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -74,8 +74,8 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): cur.execute("select * from pg_depend order 
by refclassid, refobjid, refobjsubid") # Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, env.initial_timeline + timeline_path = ( + f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{env.initial_timeline}/" ) for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py new file mode 100644 index 0000000000..77dc8a35b5 --- /dev/null +++ b/test_runner/regress/test_layer_bloating.py @@ -0,0 +1,65 @@ +import os +import time + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + logical_replication_sync, + wait_for_last_flush_lsn, +) +from fixtures.pg_version import PgVersion + + +def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + + if env.pg_version != PgVersion.V16: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + timeline = env.neon_cli.create_branch("test_logical_replication", "empty") + endpoint = env.endpoints.create_start( + "test_logical_replication", config_lines=["log_statement=all"] + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # create table... 
+ cur.execute("create table t(pk integer primary key)") + cur.execute("create publication pub1 for table t") + # Create slot to hold WAL + cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + # now start subscriber + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key)") + + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + cur.execute( + """create or replace function create_snapshots(n integer) returns void as $$ + declare + i integer; + begin + for i in 1..n loop + perform pg_log_standby_snapshot(); + end loop; + end; $$ language plpgsql""" + ) + cur.execute("set statement_timeout=0") + cur.execute("select create_snapshots(10000)") + # Wait logical replication to sync + logical_replication_sync(vanilla_pg, endpoint) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline) + time.sleep(10) + + # Check layer file sizes + timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/" + log.info(f"Check {timeline_path}") + for filename in os.listdir(timeline_path): + if filename.startswith("00000"): + log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}") + assert os.path.getsize(timeline_path + filename) < 512_000_000 diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index efba2033fb..193149ea03 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -4,12 +4,12 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, ) -from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.pageserver.common_types import 
parse_layer_file_name +from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn -from fixtures.utils import query_scalar # Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway) @@ -46,20 +46,21 @@ def test_basic_eviction( FROM generate_series(1, 5000000) g """ ) - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn) + # stops the endpoint + current_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(client, tenant_id, timeline_id, current_lsn) - # disable compute & sks to avoid on-demand downloads by walreceiver / getpage - endpoint.stop() + # stop sks to avoid on-demand downloads by walreceiver / getpage; endpoint + # has already been stopped by flush_ep_to_pageserver for sk in env.safekeepers: sk.stop() - timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id) - initial_local_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + initial_local_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( len(initial_local_layers) > 1 @@ -73,6 +74,7 @@ def test_basic_eviction( assert len(initial_local_layers) == len( initial_layer_map_info.historic_layers ), "Should have the same layers in memory and on disk" + for returned_layer in initial_layer_map_info.historic_layers: assert ( returned_layer.kind == "Delta" @@ -81,27 +83,29 @@ def test_basic_eviction( not returned_layer.remote ), f"All created layers should be present locally, but got {returned_layer}" - local_layers = list( - filter(lambda layer: layer.name == returned_layer.layer_file_name, initial_local_layers) + returned_layer_name = 
parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers + ), f"Did not find returned layer {returned_layer_name} in local layers {list(initial_local_layers.keys())}" + + local_layer_path = ( + env.pageserver.timeline_dir(tenant_id, timeline_id) + / initial_local_layers[returned_layer_name] ) assert ( - len(local_layers) == 1 - ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" - local_layer = local_layers[0] - assert ( - returned_layer.layer_file_size == local_layer.stat().st_size - ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer}" + returned_layer.layer_file_size == local_layer_path.stat().st_size + ), f"Returned layer {returned_layer} has a different file size than local layer {local_layer_path}" # Detach all layers, ensre they are not in the local FS, but are still dumped as part of the layer map - for local_layer in initial_local_layers: + for local_layer_name, local_layer_path in initial_local_layers.items(): client.evict_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_path.name ) - assert not any( - new_local_layer.name == local_layer.name for new_local_layer in timeline_path.glob("*") - ), f"Did not expect to find {local_layer} layer after evicting" + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, local_layer_name + ), f"Did not expect to find {local_layer_name} layer after evicting" - empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + empty_layers = env.pageserver.list_layers(tenant_id, timeline_id) assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) @@ -118,15 +122,15 @@ 
def test_basic_eviction( assert ( returned_layer.remote ), f"All layers should be evicted and not present locally, but got {returned_layer}" - assert any( - local_layer.name == returned_layer.layer_file_name - for local_layer in initial_local_layers + returned_layer_name = parse_layer_file_name(returned_layer.layer_file_name) + assert ( + returned_layer_name in initial_local_layers ), f"Did not find returned layer {returned_layer} in local layers {initial_local_layers}" # redownload all evicted layers and ensure the initial state is restored - for local_layer in initial_local_layers: + for local_layer_name, _local_layer_path in initial_local_layers.items(): client.download_layer( - tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer.name + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=local_layer_name.to_str() ) client.timeline_download_remote_layers( tenant_id, @@ -137,8 +141,9 @@ def test_basic_eviction( at_least_one_download=False, ) - redownloaded_layers = sorted( - list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + redownloaded_layers = dict( + (parse_layer_file_name(path.name), path) + for path in env.pageserver.list_layers(tenant_id, timeline_id) ) assert ( redownloaded_layers == initial_local_layers @@ -154,7 +159,9 @@ def test_basic_eviction( def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + # don't create initial tenant, we'll create it manually with custom config + env = neon_env_builder.init_configs() + env.start() tenant_config = { "pitr_interval": "1s", # set to non-zero, so GC actually does something @@ -165,6 +172,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): "compaction_threshold": "3", # "image_creation_threshold": set at runtime "compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for 
delta layers => enables gc of delta layers + "image_layer_creation_check_threshold": "0", # always check if a new image layer can be created } def tenant_update_config(changes): @@ -264,14 +272,14 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): resident_physical_size_metric == 0 ), "ensure that resident_physical_size metric is zero" assert resident_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote + layer.layer_file_size for layer in info.historic_layers if not layer.remote ), "ensure that resident_physical_size metric corresponds to layer map dump" remote_physical_size_metric = ps_http.get_timeline_metric( tenant_id, timeline_id, "pageserver_remote_physical_size" ) assert remote_physical_size_metric == sum( - layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote + layer.layer_file_size for layer in info.historic_layers if layer.remote ), "ensure that remote_physical_size metric corresponds to layer map dump" log.info("before runnning GC, ensure that remote_physical size is zero") diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 999e077e45..54d3b2d515 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -1,10 +1,11 @@ import time +from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.pageserver.types import ( - DeltaLayerFileName, - ImageLayerFileName, +from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver +from fixtures.pageserver.common_types import ( + DeltaLayerName, + ImageLayerName, is_future_layer, ) from fixtures.pageserver.utils import ( @@ -13,7 +14,6 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import Lsn 
from fixtures.utils import query_scalar, wait_until @@ -37,7 +37,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + env.start() env.pageserver.allowed_errors.extend( [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] ) @@ -53,6 +54,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): "checkpoint_timeout": "24h", # something we won't reach "checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually "image_creation_threshold": "100", # we want to control when image is created + "image_layer_creation_check_threshold": "0", "compaction_threshold": f"{l0_l1_threshold}", "compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers } @@ -80,7 +82,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): current = get_index_part() assert len(set(current.layer_metadata.keys())) == 1 layer_file_name = list(current.layer_metadata.keys())[0] - assert isinstance(layer_file_name, DeltaLayerFileName) + assert isinstance(layer_file_name, DeltaLayerName) assert layer_file_name.is_l0(), f"{layer_file_name}" log.info("force image layer creation in the future by writing some data into in-memory layer") @@ -115,8 +117,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): ) == 0 ) - - endpoint.stop() + last_record_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) @@ -146,7 +147,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): future_layers = get_future_layers() assert len(future_layers) == 1 future_layer = future_layers[0] - assert isinstance(future_layer, ImageLayerFileName) + assert isinstance(future_layer, ImageLayerName) assert future_layer.lsn == 
last_record_lsn log.info( f"got layer from the future: lsn={future_layer.lsn} disk_consistent_lsn={ip.disk_consistent_lsn} last_record_lsn={last_record_lsn}" @@ -160,7 +161,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): time.sleep(1.1) # so that we can use change in pre_stat.st_mtime to detect overwrites def get_generation_number(): - attachment = env.attachment_service.inspect(tenant_id) + attachment = env.storage_controller.inspect(tenant_id) assert attachment is not None return attachment[0] @@ -184,10 +185,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): # NB: the layer file is unlinked index part now, but, because we made the delete # operation stuck, the layer file itself is still in the remote_storage - def delete_at_pause_point(): - assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}") - - wait_until(10, 0.5, delete_at_pause_point) + wait_until( + 10, + 0.5, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*at failpoint.*{failpoint_name}" + ), + ) future_layer_path = env.pageserver_remote_storage.remote_layer_path( tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach ) diff --git a/test_runner/regress/test_lfc_resize.py b/test_runner/regress/test_lfc_resize.py index 5c68a63d06..2a3442448a 100644 --- a/test_runner/regress/test_lfc_resize.py +++ b/test_runner/regress/test_lfc_resize.py @@ -23,7 +23,6 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin): ) n_resize = 10 scale = 10 - log.info("postgres is running on 'test_lfc_resize' branch") def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py new file mode 100644 index 0000000000..a6f05fe0f7 --- /dev/null +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -0,0 +1,74 @@ +from pathlib import Path + +from fixtures.log_helper 
import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import query_scalar + + +def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + cache_dir = Path(env.repo_dir) / "file_cache" + cache_dir.mkdir(exist_ok=True) + + branchname = "test_approximate_working_set_size" + env.neon_cli.create_branch(branchname, "empty") + log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}") + endpoint = env.endpoints.create_start( + branchname, + config_lines=[ + "shared_buffers='1MB'", + f"neon.file_cache_path='{cache_dir}/file.cache'", + "neon.max_file_cache_size='128MB'", + "neon.file_cache_size_limit='64MB'", + ], + ) + + cur = endpoint.connect().cursor() + cur.execute("create extension neon") + + log.info(f"preparing some data in {endpoint.connstr()}") + + ddl = """ +CREATE TABLE pgbench_accounts ( + aid bigint NOT NULL, + bid integer, + abalance integer, + filler character(84), + -- more web-app like columns + text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5), + jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb +) +WITH (fillfactor='100'); +""" + + cur.execute(ddl) + # prepare index access below + cur.execute( + "ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)" + ) + cur.execute( + "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;" + ) + # ensure correct query plans and stats + cur.execute("vacuum ANALYZE pgbench_accounts") + # determine table size - working set should approximate table size after sequential scan + pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'") + log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero") + cur.execute("select approximate_working_set_size(true)") + cur.execute( + 'SELECT 
count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb' + ) + # verify working set size after sequential scan matches table size and reset working set for next test + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after sequential scan on pgbench_accounts {blocks}") + assert pages * 0.8 < blocks < pages * 1.2 + # run a few point queries with index lookup + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 4242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 54242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242") + cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242") + # verify working set size after some index access of a few select pages only + blocks = query_scalar(cur, "select approximate_working_set_size(true)") + log.info(f"working set size after some index access of a few select pages only {blocks}") + assert blocks < 10 diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 38f2034c18..76c6581448 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -4,16 +4,21 @@ import threading import time from typing import List -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder from fixtures.utils import query_scalar -def test_local_file_cache_unlink(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str): + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. 
+ neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + + env = neon_env_builder.init_start() cache_dir = os.path.join(env.repo_dir, "file_cache") os.mkdir(cache_dir) + env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) env.neon_cli.create_branch("test_local_file_cache_unlink", "empty") endpoint = env.endpoints.create_start( diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index d559be0a8f..bfffad7572 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -3,10 +3,12 @@ import uuid import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pg_version import run_only_on_default_postgres from fixtures.utils import wait_until @pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) +@run_only_on_default_postgres("it does not use any postgres functionality") def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): # self-test: make sure the event is logged (i.e., our testing endpoint works) log_expected = { @@ -32,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): def assert_logged(): if not log_expected: return - assert env.pageserver.log_contains(f".*{msg_id}.*") + env.pageserver.assert_log_contains(f".*{msg_id}.*") wait_until(10, 0.5, assert_logged) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 51e358e60d..ca3c81d6e5 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -1,16 +1,38 @@ import time +from functools import partial +from random import choice +from string import ascii_lowercase import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( + AuxFileStore, NeonEnv, + NeonEnvBuilder, 
logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.types import Lsn -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until +def random_string(n: int): + return "".join([choice(ascii_lowercase) for _ in range(n)]) + + +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.V2, AuxFileStore.CrossValidation] +) +def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): + env = neon_simple_env + with env.pageserver.http_client() as client: + tenant_config = client.tenant_config(env.initial_tenant).effective_config + assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] + + +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -20,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): "test_logical_replication", config_lines=["log_statement=all"] ) - log.info("postgres is running on 'test_logical_replication' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -152,8 +173,164 @@ COMMIT; assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1 +# Test that neon.logical_replication_max_snap_files works +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) +def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): + def slot_removed(ep): + assert ( + endpoint.safe_psql( + "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" + )[0][0] + == 0 + ) + + env = neon_simple_env + + env.neon_cli.create_branch("test_logical_replication", "empty") + # set low neon.logical_replication_max_snap_files + endpoint = env.endpoints.create_start( + "test_logical_replication", + config_lines=["log_statement=all", 
"neon.logical_replication_max_snap_files=1"], + ) + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # create obsolete slot + cur.execute("select pg_create_logical_replication_slot('stale_slot', 'pgoutput');") + assert ( + endpoint.safe_psql( + "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" + )[0][0] + == 1 + ) + + # now insert some data and create and start live subscriber to create more .snap files + # (in most cases this is not needed as stale_slot snap will have higher LSN than restart_lsn anyway) + cur.execute("create table t(pk integer primary key, payload integer)") + cur.execute("create publication pub1 for table t") + + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") + connstr = endpoint.connstr().replace("'", "''") + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) + + +def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute("create table wal_generator (id serial primary key, data text)") + cur.execute( + "SELECT * FROM pg_create_logical_replication_slot('slotty_mcslotface', 'test_decoding')" + ) + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + + endpoint.stop_and_destroy() + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute( + "SELECT * FROM 
pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" + ) + + +# Tests that walsender correctly blocks until WAL is downloaded from safekeepers +def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute("create table wal_generator (id serial primary key, data text)") + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + cur.execute("create table t(a int)") + cur.execute("create publication pub for table t") + cur.execute("insert into t values (1)") + + vanilla_pg.start() + vanilla_pg.safe_psql("create table t(a int)") + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") + logical_replication_sync(vanilla_pg, endpoint) + + vanilla_pg.stop() + + # Pause the safekeepers so that they can't send WAL (except to pageserver) + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "return")]) + + # Insert a 2 + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (2)") + + endpoint.stop_and_destroy() + + # This new endpoint should contain [1, 2], but it can't access WAL from safekeeper + endpoint = env.endpoints.create_start("init") + with endpoint.connect().cursor() as cur: + cur.execute("select * from t") + res = [r[0] for r in cur.fetchall()] + assert res == [1, 2] + + # Reconnect subscriber + vanilla_pg.start() + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"alter subscription sub1 connection '{connstr}'") + + time.sleep(5) + # Make sure the 2 
isn't replicated + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1] + + # Re-enable WAL download + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-pause-send", "off")]) + + logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2] + + # Check that local reads also work + with endpoint.connect().cursor() as cur: + cur.execute("insert into t values (3)") + logical_replication_sync(vanilla_pg, endpoint) + assert [r[0] for r in vanilla_pg.safe_psql("select * from t")] == [1, 2, 3] + + log_path = vanilla_pg.pgdatadir / "pg.log" + with open(log_path, "r") as log_file: + logs = log_file.read() + assert "could not receive data from WAL stream" not in logs + + # Test compute start at LSN page of which starts with contrecord # https://github.com/neondatabase/neon/issues/5749 +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -238,6 +415,60 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): ) == endpoint.safe_psql("select sum(somedata) from replication_example") +# Test that WAL redo works for fairly large records. +# +# See https://github.com/neondatabase/neon/pull/6534. That wasn't a +# logical replication bug as such, but without logical replication, +# records passed ot the WAL redo process are never large enough to hit +# the bug. 
+@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) +def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + cur = endpoint.connect().cursor() + cur.execute("CREATE TABLE reptbl(id int, largeval text);") + cur.execute("alter table reptbl replica identity full") + cur.execute("create publication pub1 for table reptbl") + + # now start subscriber + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE reptbl(id int, largeval text);") + + log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") + connstr = endpoint.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + # Test simple insert, update, delete. But with very large values + value = random_string(10_000_000) + cur.execute(f"INSERT INTO reptbl VALUES (1, '{value}')") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(1, value)] + + # Test delete, and reinsert another value + cur.execute("DELETE FROM reptbl WHERE id = 1") + cur.execute(f"INSERT INTO reptbl VALUES (2, '{value}')") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] + + value = random_string(10_000_000) + cur.execute(f"UPDATE reptbl SET largeval='{value}'") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] + + endpoint.stop() + endpoint.start() + cur = endpoint.connect().cursor() + value = random_string(10_000_000) + cur.execute(f"UPDATE reptbl SET largeval='{value}'") + logical_replication_sync(vanilla_pg, endpoint) + assert vanilla_pg.safe_psql("select id, largeval from reptbl") == [(2, value)] + + # # Check that slots are not inherited in 
brnach # @@ -258,8 +489,74 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): # Create branch ws. env.neon_cli.create_branch("ws", "main", tenant_id=tenant) ws_branch = env.endpoints.create_start("ws", tenant_id=tenant) - log.info("postgres is running on 'ws' branch") # Check that we can create slot with the same name ws_cur = ws_branch.connect().cursor() ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + +@pytest.mark.parametrize( + "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] +) +def test_replication_shutdown(neon_simple_env: NeonEnv): + # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed + env = neon_simple_env + env.neon_cli.create_branch("test_replication_shutdown_publisher", "empty") + pub = env.endpoints.create("test_replication_shutdown_publisher") + + env.neon_cli.create_branch("test_replication_shutdown_subscriber") + sub = env.endpoints.create("test_replication_shutdown_subscriber") + + pub.respec(skip_pg_catalog_updates=False) + pub.start() + + sub.respec(skip_pg_catalog_updates=False) + sub.start() + + pub.wait_for_migrations() + sub.wait_for_migrations() + + with pub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + # If we don't do this, creating the subscription will fail later on PG16 + pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"]) + + with sub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + with 
pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE PUBLICATION pub FOR ALL TABLES") + cur.execute("CREATE TABLE t (a int)") + cur.execute("INSERT INTO t VALUES (10), (20)") + cur.execute("SELECT * from t") + res = cur.fetchall() + assert [r[0] for r in res] == [10, 20] + + with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE TABLE t (a int)") + + pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + log.info(f"Creating subscription: {query}") + cur.execute(query) + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur: + pcur.execute("INSERT INTO t VALUES (30), (40)") + + def check_that_changes_propagated(): + cur.execute("SELECT * FROM t") + res = cur.fetchall() + log.info(res) + assert len(res) == 4 + assert [r[0] for r in res] == [10, 20, 30, 40] + + wait_until(10, 0.5, check_that_changes_propagated) diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 65d6d7a9fd..263730a823 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -1,11 +1,15 @@ +import re import time +from concurrent.futures import ThreadPoolExecutor from datetime import datetime, timedelta, timezone +import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import PageserverApiException -from fixtures.types import Lsn -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until +from requests.exceptions import ReadTimeout # @@ -28,7 +32,6 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id) endpoint_main = 
env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id) timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0] - log.info("postgres is running on 'main' branch") cur = endpoint_main.connect().cursor() @@ -64,18 +67,14 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Check edge cases # Timestamp is in the future probe_timestamp = tbl[-1][1] + timedelta(hours=1) - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) assert result["kind"] == "future" # make sure that we return a well advanced lsn here assert Lsn(result["lsn"]) > start_lsn # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) < start_lsn @@ -83,9 +82,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Probe a bunch of timestamps in the valid range for i in range(1, len(tbl), 100): probe_timestamp = tbl[i][1] - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z" - ) + result = client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id, probe_timestamp) assert result["kind"] not in ["past", "nodata"] lsn = result["lsn"] # Call get_lsn_by_timestamp to get the LSN @@ -108,29 +105,74 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder): # Timestamp is in the unreachable past probe_timestamp = tbl[0][1] - timedelta(hours=10) - result = client.timeline_get_lsn_by_timestamp( - tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2 - ) + result = 
client.timeline_get_lsn_by_timestamp(tenant_id, timeline_id_child, probe_timestamp) assert result["kind"] == "past" # make sure that we return the minimum lsn here at the start of the range assert Lsn(result["lsn"]) >= last_flush_lsn +def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): + """ + Test if cancelled pageserver get_lsn_by_timestamp request is correctly handled. + Added as an effort to improve error handling and avoid full anyhow backtrace. + """ + + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [ + ".*request was dropped before completing.*", + ".*Cancelled request finished with an error: Cancelled", + ] + ) + + client = env.pageserver.http_client() + failpoint = "find-lsn-for-timestamp-pausable" + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + # Request get_lsn_by_timestamp, hit the pausable failpoint + failing = exec.submit( + client.timeline_get_lsn_by_timestamp, + env.initial_tenant, + env.initial_timeline, + datetime.now(), + timeout=2, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + with pytest.raises(ReadTimeout): + failing.result() + + client.configure_failpoints((failpoint, "off")) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains( + "Cancelled request finished with an error: Cancelled$", offset + ), + ) + + # Test pageserver get_timestamp_of_lsn API def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): + key_not_found_error = r".*could not find data for key.*" + env = neon_env_builder.init_start() new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") endpoint_main = env.endpoints.create_start("test_ts_of_lsn_api") - log.info("postgres is running on 'test_ts_of_lsn_api' branch") cur = endpoint_main.connect().cursor() # Create table, and insert rows, each in a separate transaction - # Disable synchronous_commit 
to make this initialization go faster. + # Enable synchronous commit as we are timing sensitive # # Each row contains current insert LSN and the current timestamp, when # the row was inserted. - cur.execute("SET synchronous_commit=off") + cur.execute("SET synchronous_commit=on") cur.execute("CREATE TABLE foo (x integer)") tbl = [] for i in range(1000): @@ -139,7 +181,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc) after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()") tbl.append([i, after_timestamp, after_lsn]) - time.sleep(0.005) + time.sleep(0.02) # Execute one more transaction with synchronous_commit enabled, to flush # all the previous transactions @@ -187,8 +229,8 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): raise RuntimeError("there should have been an 'could not find data for key' error") except PageserverApiException as error: assert error.status_code == 500 - assert str(error).startswith("could not find data for key") - env.pageserver.allowed_errors.append(".*could not find data for key.*") + assert re.match(key_not_found_error, str(error)) + env.pageserver.allowed_errors.append(key_not_found_error) # Probe a bunch of timestamps in the valid range step_size = 100 diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py new file mode 100644 index 0000000000..5637f160cf --- /dev/null +++ b/test_runner/regress/test_migrations.py @@ -0,0 +1,35 @@ +import time + +from fixtures.neon_fixtures import NeonEnv + + +def test_migrations(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_migrations", "empty") + + endpoint = env.endpoints.create("test_migrations") + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + endpoint.wait_for_migrations() + + num_migrations = 9 + + with endpoint.cursor() as cur: + cur.execute("SELECT id FROM 
neon_migration.migration_id") + migration_id = cur.fetchall() + assert migration_id[0][0] == num_migrations + + endpoint.assert_log_contains(f"INFO handle_migrations: Ran {num_migrations} migrations") + + endpoint.stop() + endpoint.start() + # We don't have a good way of knowing that the migrations code path finished executing + # in compute_ctl in the case that no migrations are being run + time.sleep(1) + with endpoint.cursor() as cur: + cur.execute("SELECT id FROM neon_migration.migration_id") + migration_id = cur.fetchall() + assert migration_id[0][0] == num_migrations + + endpoint.assert_log_contains("INFO handle_migrations: Ran 0 migrations") diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 9db463dc4a..88f7a5db59 100644 --- a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content from fixtures.utils import query_scalar @@ -18,7 +17,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_multixact", "empty") endpoint = env.endpoints.create_start("test_multixact") - log.info("postgres is running on 'test_multixact' branch") cur = endpoint.connect().cursor() cur.execute( """ @@ -78,7 +76,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_multixact_new", "test_multixact", ancestor_start_lsn=lsn) endpoint_new = env.endpoints.create_start("test_multixact_new") - log.info("postgres is running on 'test_multixact_new' branch") next_multixact_id_new = endpoint_new.safe_psql( "SELECT next_multixact_id FROM pg_control_checkpoint()" )[0][0] diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 16d120e24a..ba170cfb4c 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -5,6 +5,7 @@ from typing 
import cast import pytest import requests +from fixtures.common_types import TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, @@ -13,7 +14,6 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion, skip_on_postgres -from fixtures.types import TenantId, TimelineId def helper_compare_timeline_list( @@ -133,7 +133,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): # Stop default ps/sk env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() - env.neon_cli.attachment_service_stop(False) + env.neon_cli.storage_controller_stop(False) # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageserver.running = False @@ -175,7 +175,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2) # Stop this to get out of the way of the following `start` - env.neon_cli.attachment_service_stop(False) + env.neon_cli.storage_controller_stop(False) # Default start res = env.neon_cli.raw_cli(["start"]) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 998f84f968..39b4865026 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -1,3 +1,4 @@ +import time from contextlib import closing from fixtures.log_helper import log @@ -14,8 +15,6 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): endpoint_main.respec(skip_pg_catalog_updates=False) endpoint_main.start() - log.info("postgres is running on 'test_create_extension_neon' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("SELECT extversion from pg_extension where extname='neon'") @@ -25,4 +24,73 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test 
should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.1",) + assert cur.fetchone() == ("1.3",) + cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") + res = cur.fetchall() + log.info(res) + assert len(res) == 1 + assert len(res[0]) == 5 + + +# Verify that the neon extension can be upgraded/downgraded. +def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_compatibility") + + endpoint_main = env.endpoints.create("test_neon_extension_compatibility") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SELECT extversion from pg_extension where extname='neon'") + # IMPORTANT: + # If the version has changed, the test should be updated. + # Ensure that the default version is also updated in the neon.control file + assert cur.fetchone() == ("1.3",) + cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") + all_versions = ["1.3", "1.2", "1.1", "1.0"] + current_version = "1.3" + for idx, begin_version in enumerate(all_versions): + for target_version in all_versions[idx + 1 :]: + if current_version != begin_version: + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {current_version}->{begin_version}" + ) + current_version = begin_version + # downgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{target_version}'; -- {begin_version}->{target_version}" + ) + # upgrade + cur.execute( + f"ALTER EXTENSION neon UPDATE TO '{begin_version}'; -- {target_version}->{begin_version}" + ) + + +# Verify that the neon extension can be auto-upgraded to the latest version. 
+def test_neon_extension_auto_upgrade(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_neon_extension_auto_upgrade") + + endpoint_main = env.endpoints.create("test_neon_extension_auto_upgrade") + # don't skip pg_catalog updates - it runs CREATE EXTENSION neon + endpoint_main.respec(skip_pg_catalog_updates=False) + endpoint_main.start() + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("ALTER EXTENSION neon UPDATE TO '1.0';") + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() == ("1.0",) # Ensure the extension gets downgraded + + endpoint_main.stop() + time.sleep(1) + endpoint_main.start() + time.sleep(1) + + with closing(endpoint_main.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SELECT extversion from pg_extension where extname='neon'") + assert cur.fetchone() != ("1.0",) # Ensure the extension gets upgraded diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 46b72fbca5..8edba49b8a 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail( env.neon_cli.endpoint_stop("ep1") # ep1 is stopped so create ep2 will succeed env.neon_cli.endpoint_start("ep2") + # cleanup + env.neon_cli.endpoint_stop("ep2") diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py new file mode 100644 index 0000000000..fd31df84da --- /dev/null +++ b/test_runner/regress/test_neon_superuser.py @@ -0,0 +1,99 @@ +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until + + +def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): + env = neon_simple_env + 
env.neon_cli.create_branch("test_neon_superuser_publisher", "empty") + pub = env.endpoints.create("test_neon_superuser_publisher") + + env.neon_cli.create_branch("test_neon_superuser_subscriber") + sub = env.endpoints.create("test_neon_superuser_subscriber") + + pub.respec(skip_pg_catalog_updates=False) + pub.start() + + sub.respec(skip_pg_catalog_updates=False) + sub.start() + + pub.wait_for_migrations() + sub.wait_for_migrations() + + with pub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + # If we don't do this, creating the subscription will fail later on PG16 + pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"]) + + with sub.cursor() as cur: + cur.execute( + "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser" + ) + cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers") + cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser") + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')") + assert cur.fetchall()[0][0] + cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')") + assert cur.fetchall()[0][0] + + if pg_version == PgVersion.V16: + cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'set')") + assert cur.fetchall()[0][0] + + cur.execute("CREATE PUBLICATION pub FOR ALL TABLES") + cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'") + cur.execute("CREATE DATABASE definitely_a_database") + cur.execute("CREATE TABLE t (a int)") + cur.execute("INSERT INTO t VALUES (10), (20)") + cur.execute("SELECT * from t") + res = cur.fetchall() + assert [r[0] for r in res] 
== [10, 20] + + with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur: + cur.execute("CREATE TABLE t (a int)") + + pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + log.info(f"Creating subscription: {query}") + cur.execute(query) + + with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur: + pcur.execute("INSERT INTO t VALUES (30), (40)") + + def check_that_changes_propagated(): + cur.execute("SELECT * FROM t") + res = cur.fetchall() + log.info(res) + assert len(res) == 4 + assert [r[0] for r in res] == [10, 20, 30, 40] + + wait_until(10, 0.5, check_that_changes_propagated) + + # Test that pg_monitor is working for neon_superuser role + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] != "" + # Test that pg_monitor is not working for non neon_superuser role without grant + cur.execute("CREATE ROLE not_a_superuser LOGIN PASSWORD 'Password42!'") + cur.execute("GRANT not_a_superuser TO neon_superuser WITH ADMIN OPTION") + cur.execute("SET ROLE not_a_superuser") + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] == "" + cur.execute("RESET ROLE") + # Test that pg_monitor is working for non neon_superuser role with grant + cur.execute("GRANT pg_monitor TO not_a_superuser") + cur.execute("SET ROLE not_a_superuser") + cur.execute("SELECT query from pg_stat_activity LIMIT 1") + assert cur.fetchall()[0][0] != "" + cur.execute("RESET ROLE") + cur.execute("DROP ROLE not_a_superuser") + query = "DROP SUBSCRIPTION sub CASCADE" + log.info(f"Dropping subscription: {query}") + cur.execute(query) diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 6e94e15227..98fb06a0d6 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -1,10 +1,18 @@ +import json +import os 
import time +from pathlib import Path -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_wal_insert_lsn +from fixtures.pageserver.utils import ( + wait_for_last_record_lsn, +) +from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import query_scalar -# Test restarting page server, while safekeeper and compute node keep -# running. def test_next_xid(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() @@ -52,3 +60,166 @@ def test_next_xid(neon_env_builder: NeonEnvBuilder): cur = conn.cursor() cur.execute("SELECT count(*) FROM t") assert cur.fetchone() == (iterations,) + + +# Test for a bug we had, where nextXid was incorrectly updated when the +# XID counter reached 2 billion. The nextXid tracking logic incorrectly +# treated 0 (InvalidTransactionId) as a regular XID, and after reaching +# 2 billion, it started to look like a very new XID, which caused nextXid +# to be immediately advanced to the next epoch. +# +def test_import_at_2bil( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Reset the vanilla Postgres instance to somewhat before 2 billion transactions. 
+ pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [pg_resetwal_path, "--next-transaction-id=2129920000", "-D", str(vanilla_pg.pgdatadir)] + pg_bin.run_capture(cmd) + + vanilla_pg.start() + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + vanilla_pg.safe_psql( + """create table tt as select 'long string to consume some space' || g + from generate_series(1,300000) g""" + ) + assert vanilla_pg.safe_psql("select count(*) from tt") == [(300000,)] + vanilla_pg.safe_psql("CREATE TABLE t (t text);") + vanilla_pg.safe_psql("INSERT INTO t VALUES ('inserted in vanilla')") + + endpoint_id = "ep-import_from_vanilla" + tenant = TenantId.generate() + timeline = TimelineId.generate() + + env.pageserver.tenant_create(tenant) + + # Take basebackup + basebackup_dir = os.path.join(test_output_dir, "basebackup") + base_tar = os.path.join(basebackup_dir, "base.tar") + wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") + os.mkdir(basebackup_dir) + vanilla_pg.safe_psql("CHECKPOINT") + pg_bin.run( + [ + "pg_basebackup", + "-F", + "tar", + "-d", + vanilla_pg.connstr(), + "-D", + basebackup_dir, + ] + ) + + # Get start_lsn and end_lsn + with open(os.path.join(basebackup_dir, "backup_manifest")) as f: + manifest = json.load(f) + start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] + end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] + + def import_tar(base, wal): + env.neon_cli.raw_cli( + [ + "timeline", + "import", + "--tenant-id", + str(tenant), + "--timeline-id", + str(timeline), + "--node-name", + endpoint_id, + "--base-lsn", + start_lsn, + "--base-tarfile", + base, + "--end-lsn", + end_lsn, + "--wal-tarfile", + wal, + "--pg-version", + env.pg_version, + ] + ) + + # Importing correct backup works + import_tar(base_tar, wal_tar) + wait_for_last_record_lsn(ps_http, tenant, timeline, Lsn(end_lsn)) + + endpoint = env.endpoints.create_start( + endpoint_id, + tenant_id=tenant, + config_lines=[ + "log_autovacuum_min_duration = 0", + 
"autovacuum_naptime='5 s'", + ], + ) + assert endpoint.safe_psql("select count(*) from t") == [(1,)] + + # Ok, consume + conn = endpoint.connect() + cur = conn.cursor() + + # Install extension containing function needed for test + cur.execute("CREATE EXTENSION neon_test_utils") + + # Advance nextXid close to 2 billion XIDs + while True: + xid = int(query_scalar(cur, "SELECT txid_current()")) + log.info(f"xid now {xid}") + # Consume 10k transactons at a time until we get to 2^31 - 200k + if xid < 2 * 1024 * 1024 * 1024 - 100000: + cur.execute("select test_consume_xids(50000);") + elif xid < 2 * 1024 * 1024 * 1024 - 10000: + cur.execute("select test_consume_xids(5000);") + else: + break + + # Run a bunch of real INSERTs to cross over the 2 billion mark + # Use a begin-exception block to have a separate sub-XID for each insert. + cur.execute( + """ + do $$ + begin + for i in 1..10000 loop + -- Use a begin-exception block to generate a new subtransaction on each iteration + begin + insert into t values (i); + exception when others then + raise 'not expected %', sqlerrm; + end; + end loop; + end; + $$; + """ + ) + + # Also create a multi-XID with members past the 2 billion mark + conn2 = endpoint.connect() + cur2 = conn2.cursor() + cur.execute("INSERT INTO t VALUES ('x')") + cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;") + cur.execute("COMMIT") + cur2.execute("COMMIT") + + # A checkpoint writes a WAL record with xl_xid=0. Many other WAL + # records would have the same effect. 
+ cur.execute("checkpoint") + + # wait until pageserver receives that data + wait_for_wal_insert_lsn(env, endpoint, tenant, timeline) + + # Restart endpoint + endpoint.stop() + endpoint.start() + + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("SELECT count(*) from t") + assert cur.fetchone() == (10000 + 1 + 1,) diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index 9b0bab5125..f1dd3fb67d 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -16,11 +16,9 @@ from fixtures.utils import print_gc_result, query_scalar # def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC - neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) env.neon_cli.create_branch("test_old_request_lsn", "main") endpoint = env.endpoints.create_start("test_old_request_lsn") - log.info("postgres is running on test_old_request_lsn branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index af2d7aae88..4a25dfd874 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -3,23 +3,27 @@ import time from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor from typing import Any, DefaultDict, Dict, Tuple +import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from 
fixtures.neon_fixtures import ( NeonEnvBuilder, + flush_ep_to_pageserver, last_flush_lsn_upload, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, wait_for_upload, wait_for_upload_queue_empty, + wait_until_tenant_active, ) -from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import Lsn +from fixtures.remote_storage import RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import query_scalar, wait_until @@ -165,6 +169,10 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline + #### + # Produce layers + #### + lsns = [] table_len = 10000 @@ -194,11 +202,28 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): # run checkpoint manually to be sure that data landed in remote storage client.timeline_checkpoint(tenant_id, timeline_id) - ##### Stop the first pageserver instance, erase all its data + # prevent new WAL from being produced, wait for layers to reach remote storage env.endpoints.stop_all() - - # wait until pageserver has successfully uploaded all the data to remote storage + for sk in env.safekeepers: + sk.stop() + # NB: the wait_for_upload returns as soon as remote_consistent_lsn == current_lsn. 
+ # But the checkpoint also triggers a compaction + # => image layer generation => + # => doesn't advance LSN + # => but we want the remote state to be deterministic, so additionally, wait for upload queue to drain wait_for_upload(client, tenant_id, timeline_id, current_lsn) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) + client.deletion_queue_flush(execute=True) + env.pageserver.stop() + env.pageserver.start() + # We've shut down the SKs, then restarted the PSes to sever all walreceiver connections; + # This means pageserver's remote_consistent_lsn is now frozen to whatever it was after the pageserver.stop() call. + wait_until_tenant_active(client, tenant_id) + + ### + # Produce layers complete; + # Start the actual testing. + ### def get_api_current_physical_size(): d = client.timeline_detail(tenant_id, timeline_id) @@ -215,9 +240,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" - # Wait until generated image layers are uploaded to S3 - wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id) - + # Stop the first pageserver instance, erase all its data env.pageserver.stop() # remove all the layer files @@ -312,6 +335,17 @@ def test_download_remote_layers_api( } ) + # This test triggers layer download failures on demand. It is possible to modify the failpoint + # during a `Timeline::get_vectored` right between the vectored read and its validation read. + # This means that one of the reads can fail while the other one succeeds and vice versa. + # TODO(vlad): Remove this block once the vectored read path validation goes away.
+ env.pageserver.allowed_errors.extend( + [ + ".*initial_size_calculation.*Vectored get failed with downloading evicted layer file failed, but sequential get did not.*", + ".*initial_size_calculation.*Sequential get failed with downloading evicted layer file failed, but vectored get did not.*", + ] + ) + endpoint = env.endpoints.create_start("main") client = env.pageserver.http_client() @@ -370,7 +404,7 @@ def test_download_remote_layers_api( env.pageserver.allowed_errors.extend( [ ".*download failed: downloading evicted layer file failed.*", - f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed", + f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed.*downloading evicted layer file failed", ] ) @@ -497,7 +531,7 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: with endpoint.cursor() as cur: cur.execute("update a set id = -id") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) @@ -508,7 +542,6 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: for layer in layers.historic_layers: log.info(f"pre-compact: {layer}") - assert layer.layer_file_size is not None, "we must know layer file sizes" layer_sizes += layer.layer_file_size pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) @@ -547,6 +580,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne "image_creation_threshold": 100, # repartitioning parameter, unused "compaction_target_size": 128 * 1024**2, + # Always check if a new image layer can be created + "image_layer_creation_check_threshold": 0, # pitr_interval and gc_horizon are not interesting because we dont run gc } @@ -611,7 +646,8
@@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne # threshold to expose image creation to downloading all of the needed # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 # to be less flaky - env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"}) + conf["image_creation_threshold"] = "1" + env.neon_cli.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()}) pageserver_http.timeline_compact(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) @@ -622,5 +658,200 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne assert dict(kinds_after) == {"Delta": 4, "Image": 1} +def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBuilder): + """ + Demonstrates that tenant shutdown will cancel on-demand download and secondary doing warmup. + """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + # turn off background tasks so that they don't interfere with the downloads + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + } + ) + client = env.pageserver.http_client() + failpoint = "before-downloading-layer-stream-pausable" + client.configure_failpoints((failpoint, "pause")) + + env.pageserver.allowed_errors.extend( + [ + ".*downloading failed, possibly for shutdown.*", + ] + ) + + info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(info.delta_layers()) == 1 + + layer = info.delta_layers()[0] + + client.tenant_heatmap_upload(env.initial_tenant) + + # evict the initdb layer so we can download it + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + with ThreadPoolExecutor(max_workers=2) as exec: + download = exec.submit( + client.download_layer, + env.initial_tenant, + env.initial_timeline, + layer.layer_file_name, + ) + + _, offset = wait_until( + 20, 0.5, lambda: 
env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + location_conf = {"mode": "Detached", "tenant_conf": {}} + # assume detach removes the layers + detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains( + "closing is taking longer than expected", offset + ), + ) + + client.configure_failpoints((failpoint, "off")) + + with pytest.raises( + PageserverApiException, match="downloading failed, possibly for shutdown" + ): + download.result() + + env.pageserver.assert_log_contains(".*downloading failed, possibly for shutdown.*") + + detach.result() + + client.configure_failpoints((failpoint, "pause")) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), + ) + + location_conf = { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + } + + client.tenant_location_conf(env.initial_tenant, location_conf) + + warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) + + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset), + ) + + client.configure_failpoints((failpoint, "off")) + location_conf = {"mode": "Detached", "tenant_conf": {}} + client.tenant_location_conf(env.initial_tenant, location_conf) + + client.configure_failpoints((failpoint, "off")) + + # here we have nothing in the log, but we see that the warmup and conf location update worked + warmup.result() + + +def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): + """ + Pause using a pausable_failpoint longer than the client timeout to simulate the timeout happening. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + assert isinstance(neon_env_builder.pageserver_remote_storage, S3Storage) + neon_env_builder.pageserver_remote_storage.custom_timeout = "1s" + + # turn off background tasks so that they don't interfere with the downloads + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + } + ) + client = env.pageserver.http_client() + failpoint = "before-downloading-layer-stream-pausable" + client.configure_failpoints((failpoint, "pause")) + + info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(info.delta_layers()) == 1 + + layer = info.delta_layers()[0] + + client.tenant_heatmap_upload(env.initial_tenant) + + # evict so we can download it + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + with ThreadPoolExecutor(max_workers=2) as exec: + download = exec.submit( + client.download_layer, + env.initial_tenant, + env.initial_timeline, + layer.layer_file_name, + ) + + _, offset = wait_until( + 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + # ensure enough time while paused to trip the timeout + time.sleep(2) + + client.configure_failpoints((failpoint, "off")) + download.result() + + _, offset = env.pageserver.assert_log_contains( + ".*failed, will retry \\(attempt 0\\): timeout.*" + ) + _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset) + + client.evict_layer(env.initial_tenant, env.initial_timeline, layer.layer_file_name) + + client.configure_failpoints((failpoint, "pause")) + + # capture the next offset for a new synchronization with the failpoint + _, offset = wait_until( + 20, + 0.5, + lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), + ) + + location_conf = { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + } + + 
client.tenant_location_conf( + env.initial_tenant, + location_conf, + ) + + started = time.time() + + warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) + # ensure enough time while paused to trip the timeout + time.sleep(2) + + client.configure_failpoints((failpoint, "off")) + + warmup.result() + + elapsed = time.time() - started + + _, offset = env.pageserver.assert_log_contains( + ".*failed, will retry \\(attempt 0\\): timeout.*", offset + ) + _, offset = env.pageserver.assert_log_contains(".*succeeded after [0-9]+ retries.*", offset) + + assert elapsed < 30, f"too long passed: {elapsed=}" + + def stringify(conf: Dict[str, Any]) -> Dict[str, str]: return dict(map(lambda x: (x[0], str(x[1])), conf.items())) diff --git a/test_runner/regress/test_ondemand_slru_download.py b/test_runner/regress/test_ondemand_slru_download.py new file mode 100644 index 0000000000..d6babe4393 --- /dev/null +++ b/test_runner/regress/test_ondemand_slru_download.py @@ -0,0 +1,161 @@ +from typing import Optional + +import pytest +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, tenant_get_shards +from fixtures.utils import query_scalar + + +# +# Test on-demand download of the pg_xact SLRUs +# +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_pg_xact(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + + timeline_id = env.initial_timeline + tenant_id = env.initial_tenant + endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION 
neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (2)") + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Restart postgres. After restart, the new instance will download the + # pg_xact segments lazily. + endpoint.stop() + endpoint.start() + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Consume more WAL, so that the pageserver can compact and GC older data, + # including the LSN that we started the new endpoint at, + cur.execute("CREATE TABLE anothertable (i int, t text)") + cur.execute( + "INSERT INTO anothertable SELECT g, 'long string to consume some space' || g FROM generate_series(1, 10000) g" + ) + + # Run GC + shards = tenant_get_shards(env, tenant_id, None) + for tenant_shard_id, pageserver in shards: + client = pageserver.http_client() + client.timeline_checkpoint(tenant_shard_id, timeline_id) + client.timeline_compact(tenant_shard_id, timeline_id) + client.timeline_gc(tenant_shard_id, timeline_id, 0) + + # Test that this can still on-demand download the old pg_xact segments + cur.execute("select xmin, xmax, * from clogtest") + tup = cur.fetchall() + log.info(f"tuples = {tup}") + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start( + initial_tenant_conf=tenant_conf, initial_tenant_shard_count=shard_count + ) + 
+ endpoint = env.endpoints.create_start("main") + + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + cur.execute("CREATE EXTENSION neon_test_utils") + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Consume a lot of XIDs, to create more pg_xact segments + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + + # Open a new connection and insert another row, but leave + # the transaction open + pg_conn2 = endpoint.connect() + cur2 = pg_conn2.cursor() + cur2.execute("BEGIN") + cur2.execute("INSERT INTO clogtest VALUES (2)") + + # Another insert on the first connection, which is committed. + for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + cur.execute("INSERT INTO clogtest VALUES (3)") + + # Start standby at this point in time + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) + + # Commit transaction 2, after the standby was launched. + cur2.execute("COMMIT") + + # The replica should not see transaction 2 as committed. + conn_replica = endpoint_at_lsn.connect() + cur_replica = conn_replica.cursor() + cur_replica.execute("SELECT * FROM clogtest") + assert cur_replica.fetchall() == [(1,), (3,)] + + +def test_ondemand_download_after_wal_switch(neon_env_builder: NeonEnvBuilder): + """ + Test on-demand SLRU download on standby, when starting right after + WAL segment switch. 
+ + This is a repro for a bug in how the LSN at WAL page/segment + boundary was handled (https://github.com/neondatabase/neon/issues/8030) + """ + + tenant_conf = { + "lazy_slru_download": "true", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + endpoint = env.endpoints.create_start("main") + pg_conn = endpoint.connect() + cur = pg_conn.cursor() + + # Create a test table + cur.execute("CREATE TABLE clogtest (id integer)") + cur.execute("INSERT INTO clogtest VALUES (1)") + + # Start standby at WAL segment boundary + cur.execute("SELECT pg_switch_wal()") + lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")) + _endpoint_at_lsn = env.endpoints.create_start( + branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn + ) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index e29db1e252..abbea59113 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -2,20 +2,22 @@ import subprocess from pathlib import Path from typing import Optional +import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until -# test that we cannot override node id after init -def test_pageserver_init_node_id( - neon_simple_env: NeonEnv, neon_binpath: Path, pg_distrib_dir: Path -): +def test_pageserver_init_node_id(neon_simple_env: NeonEnv, neon_binpath: Path): + """ + NB: The neon_local doesn't use `--init` mode anymore, but our production + deployment still does => https://github.com/neondatabase/aws/pull/1322 + """ workdir = neon_simple_env.pageserver.workdir pageserver_config = workdir / "pageserver.toml" pageserver_bin = neon_binpath / "pageserver" @@ -29,36 +31,47 @@ def test_pageserver_init_node_id( 
stderr=subprocess.PIPE, ) - # remove initial config and stop existing pageserver - pageserver_config.unlink() neon_simple_env.pageserver.stop() - bad_init = run_pageserver(["--init", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']) + with open(neon_simple_env.pageserver.config_toml_path, "r") as f: + ps_config = toml.load(f) + + required_config_keys = [ + "pg_distrib_dir", + "listen_pg_addr", + "listen_http_addr", + "pg_auth_type", + "http_auth_type", + # TODO: only needed for NEON_PAGESERVER_PANIC_ON_UNSPECIFIED_COMPACTION_ALGORITHM in https://github.com/neondatabase/neon/pull/7748 + # "tenant_config", + ] + required_config_overrides = [ + f"--config-override={toml.dumps({k: ps_config[k]})}" for k in required_config_keys + ] + + pageserver_config.unlink() + + bad_init = run_pageserver(["--init", *required_config_overrides]) assert ( bad_init.returncode == 1 ), "pageserver should not be able to init new config without the node id" - assert "missing id" in bad_init.stderr + assert 'missing config value "id"' in bad_init.stderr assert not pageserver_config.exists(), "config file should not be created after init error" - completed_init = run_pageserver( - ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) + good_init_cmd = [ + "--init", + f"--config-override=id={ps_config['id']}", + *required_config_overrides, + ] + completed_init = run_pageserver(good_init_cmd) assert ( completed_init.returncode == 0 ), "pageserver should be able to create a new config with the node id given" assert pageserver_config.exists(), "config file should be created successfully" - bad_reinit = run_pageserver( - ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"'] - ) - assert ( - bad_reinit.returncode == 1 - ), "pageserver should not be able to init new config without the node id" - assert "already exists, cannot init it" in bad_reinit.stderr - - bad_update = run_pageserver(["--update-config", "-c", "id = 3"]) - assert bad_update.returncode == 1, 
"pageserver should not allow updating node id" - assert "has node id already, it cannot be overridden" in bad_update.stderr + bad_reinit = run_pageserver(good_init_cmd) + assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists" + assert "config file already exists" in bad_reinit.stderr def check_client(env: NeonEnv, client: PageserverHttpClient): @@ -73,7 +86,7 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() client.tenant_create( - tenant_id, generation=env.attachment_service.attach_hook_issue(tenant_id, env.pageserver.id) + tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} diff --git a/test_runner/regress/test_pageserver_crash_consistency.py b/test_runner/regress/test_pageserver_crash_consistency.py new file mode 100644 index 0000000000..2d6b50490e --- /dev/null +++ b/test_runner/regress/test_pageserver_crash_consistency.py @@ -0,0 +1,108 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.pageserver.common_types import ImageLayerName, parse_layer_file_name +from fixtures.pageserver.utils import ( + wait_for_last_record_lsn, + wait_until_tenant_active, +) +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind +from requests.exceptions import ConnectionError + + +def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Test case for docs/rfcs/027-crash-consistent-layer-map-through-index-part.md. + + Simulate crash after compaction has written layers to disk + but before they have been uploaded/linked into remote index_part.json. + + Startup handles this situation by deleting the not yet uploaded L1 layer files. 
+ """ + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": f"{10 * 1024**2}", + "compaction_period": "0 s", + "compaction_threshold": "999999", + } + ) + pageserver_http = env.pageserver.http_client() + + tenant_id, timeline_id = env.initial_tenant, env.initial_timeline + + pageserver_http.configure_failpoints(("after-timeline-compacted-first-L1", "exit")) + + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + connstr = endpoint.connstr(options="-csynchronous_commit=off") + pg_bin.run_capture(["pgbench", "-i", "-s1", connstr]) + + lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + # make sure we receive no new wal after this, so that we'll write over the same L1 file. + endpoint.stop() + for sk in env.safekeepers: + sk.stop() + + pageserver_http.patch_tenant_config_client_side(tenant_id, {"compaction_threshold": 3}) + # hit the exit failpoint + with pytest.raises(ConnectionError, match="Remote end closed connection without response"): + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + env.pageserver.stop() + + # now the duplicate L1 has been created, but is not yet uploaded + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + # path = env.remote_storage.timeline_path(tenant_id, timeline_id) + l1_found = None + for path in env.pageserver.list_layers(tenant_id, timeline_id): + [key_range, lsn_range] = path.name.split("__", maxsplit=1) + + if "-" not in lsn_range: + # image layer + continue + + [key_start, key_end] = key_range.split("-", maxsplit=1) + + if key_start == "0" * 36 and key_end == "F" * 36: + # L0 + continue + + candidate = parse_layer_file_name(path.name) + + if isinstance(candidate, ImageLayerName): + continue + + if l1_found is not None: + raise RuntimeError(f"found multiple L1: {l1_found.to_str()} and {path.name}") + + l1_found = candidate + + assert l1_found is not 
None, "failed to find L1 locally" + + uploaded = env.pageserver_remote_storage.remote_layer_path( + tenant_id, timeline_id, l1_found.to_str() + ) + assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded" + + env.pageserver.start() + wait_until_tenant_active(pageserver_http, tenant_id) + + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, l1_found + ), "partial compaction result should had been removed during startup" + + # wait for us to catch up again + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) + + pageserver_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True) + + assert env.pageserver.layer_exists(tenant_id, timeline_id, l1_found), "the L1 reappears" + + uploaded = env.pageserver_remote_storage.remote_layer_path( + tenant_id, timeline_id, l1_found.to_str() + ) + assert uploaded.exists(), "the L1 is uploaded" + + +# TODO: same test for L0s produced by ingest. diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 87a4fa01fc..696af24e5c 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -9,21 +9,23 @@ of the pageserver are: - Updates to remote_consistent_lsn may only be made visible after validating generation """ - import enum +import os import re import time from typing import Optional import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, - S3Scrubber, - last_flush_lsn_upload, + StorageScrubber, + generate_uploads_and_deletions, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( assert_tenant_state, @@ -34,8 +36,7 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import ( 
RemoteStorageKind, ) -from fixtures.types import TenantId, TimelineId -from fixtures.utils import print_gc_result, wait_until +from fixtures.utils import wait_until from fixtures.workload import Workload # A tenant configuration that is convenient for generating uploads and deletions @@ -52,68 +53,10 @@ TENANT_CONF = { "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } -def generate_uploads_and_deletions( - env: NeonEnv, - *, - init: bool = True, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - data: Optional[str] = None, -): - """ - Using the environment's default tenant + timeline, generate a load pattern - that results in some uploads and some deletions to remote storage. - """ - - if tenant_id is None: - tenant_id = env.initial_tenant - assert tenant_id is not None - - if timeline_id is None: - timeline_id = env.initial_timeline - assert timeline_id is not None - - ps_http = env.pageserver.http_client() - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - if init: - endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)") - last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - - def churn(data): - endpoint.safe_psql_many( - [ - f""" - INSERT INTO foo (id, val) - SELECT g, '{data}' - FROM generate_series(1, 200) g - ON CONFLICT (id) DO UPDATE - SET val = EXCLUDED.val - """, - # to ensure that GC can actually remove some layers - "VACUUM foo", - ] - ) - assert tenant_id is not None - assert timeline_id is not None - # We are waiting for uploads as well as local flush, in order to avoid leaving the system - # in a state where there are "future layers" in remote storage that will generate deletions - # after a restart. 
- last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - ps_http.timeline_checkpoint(tenant_id, timeline_id) - - # Compaction should generate some GC-elegible layers - for i in range(0, 2): - churn(f"{i if data is None else data}") - - gc_result = ps_http.timeline_gc(tenant_id, timeline_id, 0) - print_gc_result(gc_result) - assert gc_result["layers_removed"] > 0 - - def read_all( env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None ): @@ -195,14 +138,24 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.broker.try_start() for sk in env.safekeepers: sk.start() - env.attachment_service.start() + env.storage_controller.start() - env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',)) + # We will start a pageserver with no control_plane_api set, so it won't be able to self-register + env.storage_controller.node_register(env.pageserver) + + replaced_config = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": "", + } + ) + env.pageserver.start() + env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) env.neon_cli.create_tenant( tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline ) - generate_uploads_and_deletions(env) + + generate_uploads_and_deletions(env, pageserver=env.pageserver) def parse_generation_suffix(key): m = re.match(".+-([0-9a-zA-Z]{8})$", key) @@ -213,23 +166,34 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): log.info(f"group: {m.group(1)}") return int(m.group(1), 16) + assert neon_env_builder.pageserver_remote_storage is not None pre_upgrade_keys = list( - [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]] + [ + o["Key"] + for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ + "Contents" + ] + ] ) for key in pre_upgrade_keys: assert parse_generation_suffix(key) is None env.pageserver.stop() - # Starting 
without the override that disabled control_plane_api + env.pageserver.patch_config_toml_nonrecursive(replaced_config) env.pageserver.start() - generate_uploads_and_deletions(env, init=False) + generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) legacy_objects: list[str] = [] suffixed_objects = [] post_upgrade_keys = list( - [o["Key"] for o in list_prefix(neon_env_builder, delimiter="")["Contents"]] + [ + o["Key"] + for o in list_prefix(neon_env_builder.pageserver_remote_storage, delimiter="")[ + "Contents" + ] + ] ) for key in post_upgrade_keys: log.info(f"post-upgrade key: {key}") @@ -251,9 +215,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = S3Scrubber( - neon_env_builder.test_output_dir, neon_env_builder - ).scan_metadata() + metadata_summary = StorageScrubber(neon_env_builder).scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 @@ -265,12 +227,16 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - some_other_pageserver = 1234 - ps_http = env.pageserver.http_client() + attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"] + main_pageserver = env.get_pageserver(attached_to_id) + other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] - generate_uploads_and_deletions(env) + ps_http = main_pageserver.http_client() + + generate_uploads_and_deletions(env, pageserver=main_pageserver) # Flush: pending deletions should all complete assert_deletion_queue(ps_http, lambda n: n > 0) @@ 
-283,14 +249,14 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - env.pageserver.allowed_errors.extend( + main_pageserver.allowed_errors.extend( [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] ) # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. - env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) - generate_uploads_and_deletions(env, init=False) + env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) + generate_uploads_and_deletions(env, init=False, pageserver=main_pageserver) assert_deletion_queue(ps_http, lambda n: n > 0) queue_depth_before = get_deletion_queue_depth(ps_http) @@ -342,9 +308,14 @@ def test_deletion_queue_recovery( neon_env_builder.enable_pageserver_remote_storage( RemoteStorageKind.MOCK_S3, ) + neon_env_builder.num_pageservers = 2 env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - ps_http = env.pageserver.http_client() + attached_to_id = env.storage_controller.locate(env.initial_tenant)[0]["node_id"] + main_pageserver = env.get_pageserver(attached_to_id) + other_pageserver = [p for p in env.pageservers if p.id != attached_to_id][0] + + ps_http = main_pageserver.http_client() failpoints = [ # Prevent deletion lists from being executed, to build up some backlog of deletions @@ -354,14 +325,13 @@ def test_deletion_queue_recovery( if validate_before == ValidateBefore.NO_VALIDATE: failpoints.append( # Prevent deletion lists from being validated, we will test that they are - # dropped properly during recovery. 'pause' is okay here because we kill - # the pageserver with immediate=true - ("control-plane-client-validate", "pause") + # dropped properly during recovery. 
This is such a long sleep as to be equivalent to "never" + ("control-plane-client-validate", "return(3600000)") ) ps_http.configure_failpoints(failpoints) - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=main_pageserver) # There should be entries in the deletion queue assert_deletion_queue(ps_http, lambda n: n > 0) @@ -388,7 +358,7 @@ def test_deletion_queue_recovery( # also wait to see the header hit the disk: this seems paranoid but the race # can really happen on a heavily overloaded test machine. def assert_header_written(): - assert (env.pageserver.workdir / "deletion" / "header-01").exists() + assert (main_pageserver.workdir / "deletion" / "header-01").exists() wait_until(20, 1, assert_header_written) @@ -398,15 +368,15 @@ def test_deletion_queue_recovery( before_restart_depth = get_deletion_queue_validated(ps_http) log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued") - env.pageserver.stop(immediate=True) + main_pageserver.stop(immediate=True) if keep_attachment == KeepAttachment.LOSE: - some_other_pageserver = 101010 - env.attachment_service.attach_hook_issue(env.initial_tenant, some_other_pageserver) + some_other_pageserver = other_pageserver.id + env.storage_controller.attach_hook_issue(env.initial_tenant, some_other_pageserver) - env.pageserver.start() + main_pageserver.start() - def assert_deletions_submitted(n: int): + def assert_deletions_submitted(n: int) -> None: assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n # After restart, issue a flush to kick the deletion frontend to do recovery. @@ -427,7 +397,7 @@ def test_deletion_queue_recovery( # validated before restart. 
assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) + main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) # If we lost the attachment, we should have dropped our pre-restart deletions. assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -436,8 +406,8 @@ def test_deletion_queue_recovery( assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 # Restart again - env.pageserver.stop(immediate=True) - env.pageserver.start() + main_pageserver.stop(immediate=True) + main_pageserver.start() # No deletion lists should be recovered: this demonstrates that deletion lists # were cleaned up after being executed or dropped in the previous process lifetime. @@ -456,7 +426,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ps_http = env.pageserver.http_client() - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=env.pageserver) env.pageserver.allowed_errors.extend( [ @@ -468,12 +438,12 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) # Simulate a major incident: the control plane goes offline - env.attachment_service.stop() + env.storage_controller.stop() # Remember how many validations had happened before the control plane went offline validated = get_deletion_queue_validated(ps_http) - generate_uploads_and_deletions(env, init=False) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) # The running pageserver should stop progressing deletions time.sleep(10) @@ -483,12 +453,15 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # incident, but it might be unavoidable: if so, we want to be able to start up # and serve clients. 
env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP - env.pageserver.start( - overrides=("--pageserver-config-override=control_plane_emergency_mode=true",) + replaced = env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_emergency_mode": True, + } ) + env.pageserver.start() # The pageserver should provide service to clients - generate_uploads_and_deletions(env, init=False) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) # The pageserver should neither validate nor execute any deletions, it should have # loaded the DeletionLists from before though @@ -498,7 +471,7 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): assert get_deletion_queue_executed(ps_http) == 0 # When the control plane comes back up, normal service should resume - env.attachment_service.start() + env.storage_controller.start() ps_http.deletion_queue_flush(execute=True) assert get_deletion_queue_depth(ps_http) == 0 @@ -507,9 +480,10 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # The pageserver should work fine when subsequently restarted in non-emergency mode env.pageserver.stop() # Non-immediate: implicitly checking that shutdown doesn't hang waiting for CP + env.pageserver.patch_config_toml_nonrecursive(replaced) env.pageserver.start() - generate_uploads_and_deletions(env, init=False) + generate_uploads_and_deletions(env, init=False, pageserver=env.pageserver) ps_http.deletion_queue_flush(execute=True) assert get_deletion_queue_depth(ps_http) == 0 assert get_deletion_queue_validated(ps_http) > 0 @@ -547,7 +521,7 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - generate_uploads_and_deletions(env) + generate_uploads_and_deletions(env, pageserver=env.pageserver) read_all(env, tenant_id, timeline_id) evict_all_layers(env, tenant_id, timeline_id) @@ -649,3 
+623,96 @@ def test_multi_attach( # All data we wrote while multi-attached remains readable workload.validate(pageservers[2].id) + + +def test_upgrade_generationless_local_file_paths( + neon_env_builder: NeonEnvBuilder, +): + """ + Test pageserver behavior when starting up with local layer paths without + generation numbers: it should accept these layer files, and avoid doing + a delete/download cycle on them. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' + ) + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(1000) + + attached_pageserver = env.get_tenant_pageserver(tenant_id) + secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[ + 0 + ] + + attached_pageserver.http_client().tenant_heatmap_upload(tenant_id) + secondary_pageserver.http_client().tenant_secondary_download(tenant_id) + + # Rename the local paths to legacy format, to simulate what + # we would see when upgrading. Do this on both attached and secondary locations, as we will + # test the behavior of both.
+ for pageserver in env.pageservers: + pageserver.stop() + timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id) + files_renamed = 0 + for filename in os.listdir(timeline_dir): + path = os.path.join(timeline_dir, filename) + log.info(f"Found file {path}") + if path.endswith("-v1-00000001"): + new_path = path[:-12] + os.rename(path, new_path) + log.info(f"Renamed {path} -> {new_path}") + files_renamed += 1 + + assert files_renamed > 0 + + pageserver.start() + + workload.validate() + + # Assert that there were no on-demand downloads + assert ( + attached_pageserver.http_client().get_metric_value( + "pageserver_remote_ondemand_downloaded_layers_total" + ) + == 0 + ) + + # Do a secondary download and ensure there were no layer downloads + secondary_pageserver.http_client().tenant_secondary_download(tenant_id) + assert ( + secondary_pageserver.http_client().get_metric_value( + "pageserver_secondary_download_layer_total" + ) + == 0 + ) + + # Check that when we evict and promote one of the legacy-named layers, everything works as + # expected + local_layers = list( + ( + parse_layer_file_name(path.name), + os.path.join(attached_pageserver.timeline_dir(tenant_id, timeline_id), path), + ) + for path in attached_pageserver.list_layers(tenant_id, timeline_id) + ) + (victim_layer_name, victim_path) = local_layers[0] + assert os.path.exists(victim_path) + + attached_pageserver.http_client().evict_layer( + tenant_id, timeline_id, victim_layer_name.to_str() + ) + assert not os.path.exists(victim_path) + + attached_pageserver.http_client().download_layer( + tenant_id, timeline_id, victim_layer_name.to_str() + ) + # We should download into the same local path we started with + assert os.path.exists(victim_path) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py new file mode 100644 index 0000000000..111285b40c --- /dev/null +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -0,0 
+1,118 @@ +import json +import uuid + +from anyio import Path +from fixtures.common_types import TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin +from fixtures.pg_version import PgVersion +from fixtures.utils import wait_until + + +def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + env = neon_env_builder.init_start() + + env.pageserver.tenant_detach(env.initial_tenant) + + env.pageserver.allowed_errors.append( + # https://github.com/neondatabase/neon/issues/6925 + r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" + ) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + rate_limit_rps = 100 + compaction_period = 5 + env.pageserver.tenant_create( + tenant_id, + conf={ + "compaction_period": f"{compaction_period}s", + "timeline_get_throttle": { + "task_kinds": ["PageRequestHandler"], + "initial": 0, + "refill_interval": "100ms", + "refill_amount": int(rate_limit_rps / 10), + "max": int(rate_limit_rps / 10), + "fair": True, + }, + }, + ) + + ps_http = env.pageserver.http_client() + + ps_http.timeline_create(PgVersion.V16, tenant_id, timeline_id) + + def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: int): + cmd = [ + str(env.neon_binpath / "pagebench"), + "get-page-latest-lsn", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--runtime", + f"{duration_secs}s", + f"{tenant_id}/{timeline_id}", + ] + + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + return int(results["total"]["request_count"]) + + log.info("warmup / make sure metrics are present") + 
run_pagebench_at_max_speed_and_get_total_requests_completed(2) + metrics_query = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "smgr_query_type": "get_page_at_lsn", + } + metric_name = "pageserver_smgr_query_seconds_sum" + smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_pre is not None + + marker = uuid.uuid4().hex + ps_http.post_tracing_event("info", marker) + _, marker_offset = wait_until( + 10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None) + ) + + log.info("run pagebench") + duration_secs = 10 + actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs) + + log.info("validate the client is capped at the configured rps limit") + expect_ncompleted = duration_secs * rate_limit_rps + delta_abs = abs(expect_ncompleted - actual_ncompleted) + threshold = 0.05 * expect_ncompleted + assert ( + threshold / rate_limit_rps < 0.1 * duration_secs + ), "test self-test: unrealistic expecations regarding precision in this test" + assert ( + delta_abs < 0.05 * expect_ncompleted + ), "the throttling deviates more than 5percent from the expectation" + + log.info("validate that we logged the throttling") + + wait_until( + 10, + compaction_period / 10, + lambda: env.pageserver.assert_log_contains( + f".*{tenant_id}.*shard was throttled in the last n_seconds.*", + offset=marker_offset, + ), + ) + + log.info("validate that the metric doesn't include throttle wait time") + smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query) + assert smgr_query_seconds_post is not None + actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre + + assert ( + duration_secs >= 10 * actual_smgr_query_seconds + ), "smgr metrics should not include throttle wait time" diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py new file mode 100644 index 
0000000000..66b6185aaa --- /dev/null +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -0,0 +1,313 @@ +import asyncio +import os +import time +from typing import Optional, Tuple + +import psutil +import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + tenant_get_shards, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.utils import wait_until + +TIMELINE_COUNT = 10 +ENTRIES_PER_TIMELINE = 10_000 +CHECKPOINT_TIMEOUT_SECONDS = 60 + + +async def run_worker_for_tenant( + env: NeonEnv, entries: int, tenant: TenantId, offset: Optional[int] = None +) -> Lsn: + if offset is None: + offset = 0 + + with env.endpoints.create_start("main", tenant_id=tenant) as ep: + conn = await ep.connect_async() + try: + await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)") + await conn.execute( + f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series({offset},{entries}) as i" + ) + finally: + await conn.close(timeout=10) + + last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + return last_flush_lsn + + +async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: + tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) + return tenant, timeline, last_flush_lsn + + +async def workload( + env: NeonEnv, tenant_conf, timelines: int, entries: int +) -> list[Tuple[TenantId, TimelineId, Lsn]]: + workers = [asyncio.create_task(run_worker(env, tenant_conf, entries)) for _ in range(timelines)] + return await asyncio.gather(*workers) + + +def wait_until_pageserver_is_caught_up( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, 
timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + waited = wait_for_last_record_lsn( + pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn + ) + assert waited >= last_flush_lsn + + +def wait_until_pageserver_has_uploaded( + env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]] +): + for tenant, timeline, last_flush_lsn in last_flush_lsns: + shards = tenant_get_shards(env, tenant) + for tenant_shard_id, pageserver in shards: + wait_for_upload(pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn) + + +def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: + def query(): + value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total") + assert value is not None + return value + + # The metric gets initialised on the first update. + # Retry a few times, but return 0 if it's stable. + try: + return float(wait_until(3, 0.5, query)) + except Exception: + return 0 + + +def get_dirty_bytes(env): + v = env.pageserver.http_client().get_metric_value("pageserver_timeline_ephemeral_bytes") or 0 + log.info(f"dirty_bytes: {v}") + return v + + +def assert_dirty_bytes(env, v): + assert get_dirty_bytes(env) == v + + +def assert_dirty_bytes_nonzero(env): + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes > 0 + return dirty_bytes + + +@pytest.mark.parametrize("immediate_shutdown", [True, False]) +def test_pageserver_small_inmemory_layers( + neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool +): + """ + Test that open layers get flushed after the `checkpoint_timeout` config + and do not require WAL reingest upon restart. + + The workload creates a number of timelines and writes some data to each, + but not enough to trigger flushes via the `checkpoint_distance` config. + """ + tenant_conf = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. 
+ "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", + "compaction_period": "1s", + } + + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + ps_http_client = env.pageserver.http_client() + total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) + + # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, + # such that there are zero bytes of ephemeral layer left on the pageserver + log.info("Waiting for background checkpoints...") + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + + # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they + # must be uploaded to remain visible to the pageserver after restart. + wait_until_pageserver_has_uploaded(env, last_flush_lsns) + + env.pageserver.restart(immediate=immediate_shutdown) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # Catching up with WAL ingest should have resulted in zero bytes of ephemeral layers, since + # we froze, flushed and uploaded everything before restarting. There can be no more WAL writes + # because we shut down compute endpoints before flushing. 
+ assert get_dirty_bytes(env) == 0 + + total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client) + + log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}") + log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}") + + assert total_wal_ingested_after_restart == 0 + + +def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): + """ + Test that `checkpoint_timeout` is enforced even if there is no safekeeper input. + """ + tenant_conf = { + # Large `checkpoint_distance` effectively disables size + # based checkpointing. + "checkpoint_distance": f"{2 * 1024 ** 3}", + "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s", + "compaction_period": "1s", + } + + env = neon_env_builder.init_configs() + env.start() + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, TIMELINE_COUNT, ENTRIES_PER_TIMELINE)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. + wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # Stop the safekeepers, so that we cannot have any more WAL receiver connections + for sk in env.safekeepers: + sk.stop() + + # We should have got here fast enough that we didn't hit the background interval yet, + # and the teardown of SK connections shouldn't prompt any layer freezing. + assert get_dirty_bytes(env) > 0 + + # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, + # such that there are zero bytes of ephemeral layer left on the pageserver + log.info("Waiting for background checkpoints...") + wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) # type: ignore + + # The code below verifies that we do not flush on the first write + # after an idle period longer than the checkpoint timeout. 
+ + # Sit quietly for longer than the checkpoint timeout + time.sleep(CHECKPOINT_TIMEOUT_SECONDS + CHECKPOINT_TIMEOUT_SECONDS / 2) + + # Restart the safekeepers and write a bit of extra data into one tenant + for sk in env.safekeepers: + sk.start() + + tenant_with_extra_writes = last_flush_lsns[0][0] + asyncio.run( + run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) + ) + + dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) # type: ignore + + # We shouldn't flush since we've just opened a new layer + waited_for = 0 + while waited_for < CHECKPOINT_TIMEOUT_SECONDS // 4: + time.sleep(5) + waited_for += 5 + + assert get_dirty_bytes(env) >= dirty_after_write + + +@pytest.mark.skipif( + # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is + # prohibitively slow in debug mode + os.getenv("BUILD_TYPE") == "debug", + reason="Avoid running bulkier ingest tests in debug mode", +) +def test_total_size_limit(neon_env_builder: NeonEnvBuilder): + """ + Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is + individually exceeding checkpoint thresholds. + """ + + system_memory = psutil.virtual_memory().total + + # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on + # a system with 128GB of RAM). We will then write enough data to violate this limit. 
+ max_dirty_data = 128 * 1024 * 1024 + ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory + assert ephemeral_bytes_per_memory_kb > 0 + + neon_env_builder.pageserver_config_override = f""" + ephemeral_bytes_per_memory_kb={ephemeral_bytes_per_memory_kb} + """ + + compaction_period_s = 10 + + tenant_conf = { + # Large space + time thresholds: effectively disable these limits + "checkpoint_distance": f"{1024 ** 4}", + "checkpoint_timeout": "3600s", + "compaction_period": f"{compaction_period_s}s", + } + + env = neon_env_builder.init_configs() + env.start() + + timeline_count = 10 + + # This is about 2MiB of data per timeline + entries_per_timeline = 100_000 + + last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline)) + wait_until_pageserver_is_caught_up(env, last_flush_lsns) + + total_bytes_ingested = 0 + for tenant, timeline, last_flush_lsn in last_flush_lsns: + http_client = env.pageserver.http_client() + initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) + total_bytes_ingested += last_flush_lsn - initdb_lsn + + log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})") + assert total_bytes_ingested > max_dirty_data + + # Expected end state: the total physical size of all the tenants is in excess of the max dirty + # data, but the total amount of dirty data is less than the limit: this demonstrates that we + # have exceeded the threshold but then rolled layers in response + def get_total_historic_layers(): + total_ephemeral_layers = 0 + total_historic_bytes = 0 + for tenant, timeline, _last_flush_lsn in last_flush_lsns: + http_client = env.pageserver.http_client() + initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) + layer_map = http_client.layer_map_info(tenant, timeline) + total_historic_bytes += sum( + layer.layer_file_size + for layer in layer_map.historic_layers + if Lsn(layer.lsn_start) > initdb_lsn + ) + 
total_ephemeral_layers += len(layer_map.in_memory_layers) + + log.info( + f"Total historic layer bytes: {total_historic_bytes} ({total_ephemeral_layers} ephemeral layers)" + ) + + return total_historic_bytes + + def assert_bytes_rolled(): + assert total_bytes_ingested - get_total_historic_layers() <= max_dirty_data + + # Wait until enough layers have rolled that the amount of dirty data is under the threshold. + # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing + # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit. + wait_until(compaction_period_s * 2, 1, assert_bytes_rolled) + + # The end state should also have the reported metric under the limit + def assert_dirty_data_limited(): + dirty_bytes = get_dirty_bytes(env) + assert dirty_bytes < max_dirty_data + + wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) # type: ignore diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 042961baa5..cea35a6acb 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -1,17 +1,23 @@ +import gzip import json +import os import time from dataclasses import dataclass from pathlib import Path from queue import SimpleQueue from typing import Any, Dict, Set +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) -from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TenantId, TimelineId +from fixtures.remote_storage import ( + LocalFsStorage, + RemoteStorageKind, + remote_storage_to_toml_inline_table, +) from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -40,6 +46,9 @@ def 
test_metric_collection( uploads.put((events, is_last == "true")) return Response(status=200) + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + assert neon_env_builder.pageserver_remote_storage is not None + # Require collecting metrics frequently, since we change # the timeline and want something to be logged about it. # @@ -48,12 +57,11 @@ def test_metric_collection( neon_env_builder.pageserver_config_override = f""" metric_collection_interval="1s" metric_collection_endpoint="{metric_collection_endpoint}" + metric_collection_bucket={remote_storage_to_toml_inline_table(neon_env_builder.pageserver_remote_storage)} cached_metric_collection_interval="0s" synthetic_size_calculation_interval="3s" """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}") # mock http server that returns OK for the metrics @@ -67,9 +75,7 @@ def test_metric_collection( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - # we have a fast rate of calculation, these can happen at shutdown - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ".*metrics_collection: failed to upload to S3: Failed to upload data of length .* to storage path.*", ] ) @@ -166,6 +172,20 @@ def test_metric_collection( httpserver.check() + # Check that at least one bucket output object is present, and that all + # can be decompressed and decoded. 
+ bucket_dumps = {} + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root): + for file in files: + file_path = os.path.join(dirpath, file) + log.info(file_path) + if file.endswith(".gz"): + bucket_dumps[file_path] = json.load(gzip.open(file_path)) + + assert len(bucket_dumps) >= 1 + assert all("events" in data for data in bucket_dumps.values()) + def test_metric_collection_cleans_up_tempfile( httpserver: HTTPServer, @@ -215,9 +235,6 @@ def test_metric_collection_cleans_up_tempfile( env.pageserver.allowed_errors.extend( [ ".*metrics endpoint refused the sent metrics*", - # we have a fast rate of calculation, these can happen at shutdown - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", ] ) diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py new file mode 100644 index 0000000000..aecfcdd262 --- /dev/null +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -0,0 +1,42 @@ +import threading +import time +from contextlib import closing + +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, PgBin + + +# Test updating neon.pageserver_connstring setting on the fly. +# +# This merely changes some whitespace in the connection string, so +# this doesn't prove that the new string actually takes effect. But at +# least the code gets exercised. 
+def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_restarts") + endpoint = env.endpoints.create_start("test_pageserver_restarts") + n_reconnects = 1000 + timeout = 0.01 + scale = 10 + + def run_pgbench(connstr: str): + log.info(f"Start a pgbench workload on pg {connstr}") + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) + pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr]) + + thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True) + thread.start() + + with closing(endpoint.connect()) as con: + with con.cursor() as c: + c.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = c.fetchall()[0][0] + for i in range(n_reconnects): + time.sleep(timeout) + c.execute( + "alter system set neon.pageserver_connstring=%s", + (connstring + (" " * (i % 2)),), + ) + c.execute("select pg_reload_conf()") + + thread.join() diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index c4499196b5..4ce53df214 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -1,4 +1,6 @@ +import random from contextlib import closing +from typing import Optional import pytest from fixtures.log_helper import log @@ -18,7 +20,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): endpoint = env.endpoints.create_start("main") pageserver_http = env.pageserver.http_client() - assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -53,7 +58,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): env.pageserver.start() # We reloaded our tenant - assert 
pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + pageserver_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) + == 1 + ) cur.execute("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) @@ -141,18 +149,19 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # Test that repeatedly kills and restarts the page server, while the # safekeeper and compute node keep running. @pytest.mark.timeout(540) -def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_pageserver_chaos( + neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int] +): if build_type == "debug": pytest.skip("times out in debug builds") neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start() - - # these can happen, if we shutdown at a good time. to be fixed as part of #5172. - message = ".*duplicated L1 layer layer=.*" - env.pageserver.allowed_errors.append(message) + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. @@ -192,13 +201,19 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str): log.info(f"shared_buffers is {row[0]}, table size {row[1]}") assert int(row[0]) < int(row[1]) + # We run "random" kills using a fixed seed, to improve reproducibility if a test + # failure is related to a particular order of operations. 
+ seed = 0xDEADBEEF + rng = random.Random(seed) + # Update the whole table, then immediately kill and restart the pageserver for i in range(1, 15): endpoint.safe_psql("UPDATE foo set updates = updates + 1") # This kills the pageserver immediately, to simulate a crash - env.pageserver.stop(immediate=True) - env.pageserver.start() + to_kill = rng.choice(env.pageservers) + to_kill.stop(immediate=True) + to_kill.start() # Check that all the updates are visible num_updates = endpoint.safe_psql("SELECT sum(updates) FROM foo")[0][0] diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index a9eff99a0c..2782d33e15 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,13 +1,21 @@ +import json +import os import random -from pathlib import Path +import time from typing import Any, Dict, Optional import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber -from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import TenantId, TimelineId +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber +from fixtures.pageserver.common_types import parse_layer_file_name +from fixtures.pageserver.utils import ( + assert_prefix_empty, + poll_for_remote_storage_iterations, + tenant_delete_wait_completed, + wait_for_upload_queue_empty, +) +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload @@ -43,9 +51,13 @@ def evict_random_layers( if "ephemeral" in layer.name or "temp_download" in layer.name: continue + layer_name = parse_layer_file_name(layer.name) + if rng.choice([True, 
False]): - log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") - client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer_name.to_str()}") + client.evict_layer( + tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer_name.to_str() + ) @pytest.mark.parametrize("seed", [1, 2, 3]) @@ -61,7 +73,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): """ neon_env_builder.num_pageservers = 3 neon_env_builder.enable_pageserver_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, + remote_storage_kind=s3_storage(), ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) @@ -70,23 +82,24 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will make no effort to avoid stale attachments for ps in env.pageservers: ps.allowed_errors.extend( [ + # We will make no effort to avoid stale attachments ".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active ".*query handler.*Tenant.*not active.*", + # this shutdown case is logged at WARN severity by the time it bubbles up to logical size calculation code + # WARN ...: initial size calculation failed: downloading failed, possibly for shutdown + ".*downloading failed, possibly for shutdown", + # {tenant_id=... 
timeline_id=...}:handle_pagerequests:handle_get_page_at_lsn_request{rel=1664/0/1260 blkno=0 req_lsn=0/149F0D8}: error reading relation or page version: Not found: will not become active. Current state: Stopping\n' + ".*page_service.*will not become active.*", ] ) - # these can happen, if we shutdown at a good time. to be fixed as part of #5172. - message = ".*duplicated L1 layer layer=.*" - ps.allowed_errors.append(message) - workload = Workload(env, tenant_id, timeline_id) workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) @@ -135,11 +148,21 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): pageserver.stop() pageserver.start() if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id: + # /re-attach call will bump generation: track that in our state in case we do an + # "attach in same generation" operation later + assert last_state_ps[1] is not None # latest_attached == pageserfer.id implies this + # The re-attach API increments generation by exactly one. 
+ new_generation = last_state_ps[1] + 1 + last_state[pageserver.id] = (last_state_ps[0], new_generation) + tenants = pageserver.http_client().tenant_list() + assert len(tenants) == 1 + assert tenants[0]["generation"] == new_generation + log.info("Entering postgres...") workload.churn_rows(rng.randint(128, 256), pageserver.id) workload.validate(pageserver.id) elif last_state_ps[0].startswith("Attached"): - # The `attachment_service` will only re-attach on startup when a pageserver was the + # The `storage_controller` will only re-attach on startup when a pageserver was the # holder of the latest generation: otherwise the pageserver will revert to detached # state if it was running attached with a stale generation last_state[pageserver.id] = ("Detached", None) @@ -164,12 +187,12 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): generation = last_state_ps[1] else: # Switch generations, while also jumping between attached states - generation = env.attachment_service.attach_hook_issue( + generation = env.storage_controller.attach_hook_issue( tenant_id, pageserver.id ) latest_attached = pageserver.id else: - generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id) + generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver.id) latest_attached = pageserver.id else: generation = None @@ -188,6 +211,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): ) workload.validate(pageserver.id) + # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check + # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, + # to also validate that the scrubber isn't breaking anything. 
+ gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] > 0 + # Attach all pageservers for ps in env.pageservers: location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}} @@ -200,10 +230,11 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): # Detach all pageservers for ps in env.pageservers: location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}} + assert ps.list_layers(tenant_id, timeline_id) != [] ps.tenant_location_configure(tenant_id, location_conf) - # Confirm that all local disk state was removed on detach - # TODO + # Confirm that all local disk state was removed on detach + assert ps.list_layers(tenant_id, timeline_id) == [] def test_live_migration(neon_env_builder: NeonEnvBuilder): @@ -211,9 +242,8 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): Test the sequence of location states that are used in a live migration. 
""" neon_env_builder.num_pageservers = 2 - neon_env_builder.enable_pageserver_remote_storage( - remote_storage_kind=RemoteStorageKind.MOCK_S3, - ) + remote_storage_kind = RemoteStorageKind.MOCK_S3 + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind=remote_storage_kind) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) tenant_id = env.initial_tenant @@ -256,7 +286,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Encourage the new location to download while still in secondary mode pageserver_b.http_client().tenant_secondary_download(tenant_id) - migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) + migrated_generation = env.storage_controller.attach_hook_issue(tenant_id, pageserver_b.id) log.info(f"Acquired generation {migrated_generation} for destination pageserver") assert migrated_generation == initial_generation + 1 @@ -329,6 +359,12 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) + del workload + + # Check that deletion works properly on a tenant that was live-migrated + # (reproduce https://github.com/neondatabase/neon/issues/6802) + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations) def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): @@ -374,32 +410,6 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): validate_heatmap(heatmap_second) -def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: - """ - Inspect local storage on a pageserver to discover which layer files are present. - - :return: list of relative paths to layers, from the timeline root. 
- """ - timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) - - def relative(p: Path) -> Path: - return p.relative_to(timeline_path) - - return sorted( - list( - map( - relative, - filter( - lambda path: path.name != "metadata" - and "ephemeral" not in path.name - and "temp" not in path.name, - timeline_path.glob("*"), - ), - ) - ) - ) - - def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): """ Test the overall data flow in secondary mode: @@ -408,12 +418,17 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): - Eviction of layers on the attached location results in deletion on the secondary location as well. """ + + # For debug of https://github.com/neondatabase/neon/issues/6966 + neon_env_builder.rust_log_override = "DEBUG" + neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=RemoteStorageKind.MOCK_S3, ) env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) - assert env.attachment_service is not None + assert env.storage_controller is not None + assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -443,10 +458,14 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): log.info("Synchronizing after initial write...") ps_attached.http_client().tenant_heatmap_upload(tenant_id) + # Ensure that everything which appears in the heatmap is also present in S3: heatmap writers + # are allowed to upload heatmaps that reference layers which are only enqueued for upload + wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id ) # Make changes on attached 
pageserver, check secondary downloads them @@ -455,11 +474,26 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): workload.churn_rows(128, ps_attached.id) ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + # Ensure that everything which appears in the heatmap is also present in S3: heatmap writers + # are allowed to upload heatmaps that reference layers which are only enqueued for upload + wait_for_upload_queue_empty(ps_attached.http_client(), tenant_id, timeline_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - ) + try: + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id + ) + except: + # Do a full listing of the secondary location on errors, to help debug of + # https://github.com/neondatabase/neon/issues/6966 + timeline_path = ps_secondary.timeline_dir(tenant_id, timeline_id) + for path, _dirs, files in os.walk(timeline_path): + for f in files: + log.info(f"Secondary file: {os.path.join(path, f)}") + + raise # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while # walreceiver is still doing something. 
@@ -469,23 +503,40 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Do evictions on attached pageserver, check secondary follows along # ================================================================== - log.info("Evicting a layer...") - layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] - ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + try: + log.info("Evicting a layer...") + layer_to_evict = ps_attached.list_layers(tenant_id, timeline_id)[0] + some_other_layer = ps_attached.list_layers(tenant_id, timeline_id)[1] + log.info(f"Victim layer: {layer_to_evict.name}") + ps_attached.http_client().evict_layer( + tenant_id, timeline_id, layer_name=layer_to_evict.name + ) - log.info("Synchronizing after eviction...") - ps_attached.http_client().tenant_heatmap_upload(tenant_id) - ps_secondary.http_client().tenant_secondary_download(tenant_id) + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id) + heatmap_layers = set( + layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"] + ) + assert layer_to_evict.name not in heatmap_layers + assert parse_layer_file_name(some_other_layer.name).to_str() in heatmap_layers - assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - ) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in ps_attached.list_layers(tenant_id, timeline_id) + assert ps_attached.list_layers(tenant_id, timeline_id) == ps_secondary.list_layers( + tenant_id, timeline_id + ) + except: + # On assertion failures, log some details to help with debugging + heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) + log.warn(f"heatmap contents: 
{json.dumps(heatmap,indent=2)}") + raise # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata() + StorageScrubber(neon_env_builder).scan_metadata() # Detach secondary and delete tenant # =================================== @@ -504,7 +555,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10) assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -512,3 +563,236 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) ), ) + workload.stop() + + +def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): + """ + Slow test that runs in realtime, checks that the background scheduling of secondary + downloads happens as expected. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Create this many tenants, each with two timelines + tenant_count = 4 + tenant_timelines = {} + + # This mirrors a constant in `downloader.rs` + default_download_period_secs = 60 + + # The upload period, which will also be the download once the secondary has seen its first heatmap + upload_period_secs = 30 + + for _i in range(0, tenant_count): + tenant_id = TenantId.generate() + timeline_a = TimelineId.generate() + timeline_b = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_a, + placement_policy='{"Attached":1}', + # Run with a low heatmap period so that we can avoid having to do synthetic API calls + # to trigger the upload promptly. 
+ conf={"heatmap_period": f"{upload_period_secs}s"}, + ) + env.neon_cli.create_timeline("main2", tenant_id, timeline_b) + + tenant_timelines[tenant_id] = [timeline_a, timeline_b] + + def await_log(pageserver, deadline, expression): + """ + Wrapper around assert_log_contains that waits with a deadline rather than timeout + """ + now = time.time() + if now > deadline: + raise RuntimeError(f"Timed out waiting for {expression}") + else: + timeout = int(deadline - now) + 1 + try: + wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) # type: ignore + except: + log.error(f"Timed out waiting for '{expression}'") + raise + + t_start = time.time() + + # Wait long enough that the background downloads should happen; we expect all the inital layers + # of all the initial timelines to show up on the secondary location of each tenant. + initial_download_deadline = time.time() + default_download_period_secs * 3 + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + now = time.time() + if now > initial_download_deadline: + raise RuntimeError("Timed out waiting for initial secondary download") + else: + for timeline_id in timelines: + log.info( + f"Waiting for downloads of timeline {timeline_id} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + initial_download_deadline, + f".*{timeline_id}.*Wrote timeline_detail.*", + ) + + for timeline_id in timelines: + log.info( + f"Checking for secondary timeline downloads {timeline_id} on node {ps_secondary.id}" + ) + # One or more layers should be present for all timelines + assert ps_secondary.list_layers(tenant_id, timeline_id) + + # Delete the second timeline: this should be reflected later on the secondary + 
env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1]) + + # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor + deletion_deadline = time.time() + upload_period_secs * 3 + + for tenant_id, timelines in tenant_timelines.items(): + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + # We only have two: the other one must be secondary + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + expect_del_timeline = timelines[1] + log.info( + f"Waiting for deletion of timeline {expect_del_timeline} on secondary pageserver {ps_secondary.id}" + ) + await_log( + ps_secondary, + deletion_deadline, + f".*Timeline no longer in heatmap.*{expect_del_timeline}.*", + ) + + # This one was not deleted + assert ps_secondary.list_layers(tenant_id, timelines[0]) + + # This one was deleted + log.info( + f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}" + ) + assert not ps_secondary.list_layers(tenant_id, expect_del_timeline) + + t_end = time.time() + + # Measure how many heatmap downloads we did in total: this checks that we succeeded with + # proper scheduling, and not some bug that just runs downloads in a loop. 
+ total_heatmap_downloads = 0 + for ps in env.pageservers: + v = ps.http_client().get_metric_value("pageserver_secondary_download_heatmap_total") + assert v is not None + total_heatmap_downloads += int(v) + + download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start) + + expect_download_rate = 1.0 / upload_period_secs + log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min") + + assert download_rate < expect_download_rate * 2 + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@pytest.mark.parametrize("via_controller", [True, False]) +def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): + """ + Test use of secondary download API for slow downloads, where slow means either a healthy + system with a large capacity shard, or some unhealthy remote storage. + + The download API is meant to respect a client-supplied time limit, and return 200 or 202 + selectively based on whether the download completed. 
+ """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + env.neon_cli.create_tenant( + tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' + ) + + attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] + ps_attached = env.get_pageserver(attached_to_id) + ps_secondary = next(p for p in env.pageservers if p != ps_attached) + + # Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + workload.write_rows(128) + ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id) + + # Expect lots of layers + assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10 + + # Simulate large data by making layer downloads artifically slow + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + + # Upload a heatmap, so that secondaries have something to download + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + if via_controller: + http_client = env.storage_controller.pageserver_api() + http_client.tenant_location_conf( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + "generation": None, + }, + ) + else: + http_client = ps_secondary.http_client() + + # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms + (status, 
progress_1) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000) + assert status == 202 + assert progress_1["heatmap_mtime"] is not None + assert progress_1["layers_downloaded"] > 0 + assert progress_1["bytes_downloaded"] > 0 + assert progress_1["layers_total"] > progress_1["layers_downloaded"] + assert progress_1["bytes_total"] > progress_1["bytes_downloaded"] + + # Multiple polls should work: use a shorter wait period this time + (status, progress_2) = http_client.tenant_secondary_download(tenant_id, wait_ms=1000) + assert status == 202 + assert progress_2["heatmap_mtime"] is not None + assert progress_2["layers_downloaded"] > 0 + assert progress_2["bytes_downloaded"] > 0 + assert progress_2["layers_total"] > progress_2["layers_downloaded"] + assert progress_2["bytes_total"] > progress_2["bytes_downloaded"] + + # Progress should be >= the first poll: this can only go backward if we see a new heatmap, + # and the heatmap period on the attached node is much longer than the runtime of this test, so no + # new heatmap should have been uploaded. 
+ assert progress_2["layers_downloaded"] >= progress_1["layers_downloaded"] + assert progress_2["bytes_downloaded"] >= progress_1["bytes_downloaded"] + assert progress_2["layers_total"] == progress_1["layers_total"] + assert progress_2["bytes_total"] == progress_1["bytes_total"] + + # Make downloads fast again: when the download completes within this last request, we + # get a 200 instead of a 202 + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")]) + (status, progress_3) = http_client.tenant_secondary_download(tenant_id, wait_ms=20000) + assert status == 200 + assert progress_3["heatmap_mtime"] is not None + assert progress_3["layers_total"] == progress_3["layers_downloaded"] + assert progress_3["bytes_total"] == progress_3["bytes_downloaded"] diff --git a/test_runner/regress/test_parallel_copy.py b/test_runner/regress/test_parallel_copy.py index 6f74d50b92..b33e387a66 100644 --- a/test_runner/regress/test_parallel_copy.py +++ b/test_runner/regress/test_parallel_copy.py @@ -1,7 +1,6 @@ import asyncio from io import BytesIO -from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnv @@ -44,7 +43,6 @@ def test_parallel_copy(neon_simple_env: NeonEnv, n_parallel=5): env = neon_simple_env env.neon_cli.create_branch("test_parallel_copy", "empty") endpoint = env.endpoints.create_start("test_parallel_copy") - log.info("postgres is running on 'test_parallel_copy' branch") # Create test table conn = endpoint.connect() diff --git a/test_runner/regress/test_pg_query_cancellation.py b/test_runner/regress/test_pg_query_cancellation.py new file mode 100644 index 0000000000..bad2e5865e --- /dev/null +++ b/test_runner/regress/test_pg_query_cancellation.py @@ -0,0 +1,282 @@ +from contextlib import closing +from typing import Set + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonPageserver +from fixtures.pageserver.http import 
PageserverHttpClient +from psycopg2.errors import QueryCanceled + +CRITICAL_PG_PS_WAIT_FAILPOINTS: Set[str] = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", + "ps::handle-pagerequest-message::exists", + "ps::handle-pagerequest-message::nblocks", + "ps::handle-pagerequest-message::getpage", + "ps::handle-pagerequest-message::dbsize", + # We don't yet have a good way to on-demand guarantee the download of an + # SLRU segment, so that's disabled for now. + # "ps::handle-pagerequest-message::slrusegment", +} + +PG_PS_START_FAILPOINTS = { + "ps::connection-start::pre-login", + "ps::connection-start::startup-packet", + "ps::connection-start::process-query", +} +SMGR_EXISTS = "ps::handle-pagerequest-message::exists" +SMGR_NBLOCKS = "ps::handle-pagerequest-message::nblocks" +SMGR_GETPAGE = "ps::handle-pagerequest-message::getpage" +SMGR_DBSIZE = "ps::handle-pagerequest-message::dbsize" + +""" +Test that we can handle connection delays and cancellations at various +unfortunate connection startup and request states. 
+""" + + +def test_cancellations(neon_simple_env: NeonEnv): + env = neon_simple_env + ps = env.pageserver + ps_http = ps.http_client() + ps_http.is_testing_enabled_or_skip() + + env.neon_cli.create_branch("test_config", "empty") + + # We don't want to have any racy behaviour with autovacuum IOs + ep = env.endpoints.create_start( + "test_config", + config_lines=[ + "autovacuum = off", + "shared_buffers = 128MB", + ], + ) + + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + CREATE TABLE test1 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1, 1024::bigint) p(id); + """ + ) + cur.execute( + """ + CREATE TABLE test2 AS + SELECT id, sha256(id::text::bytea) payload + FROM generate_series(1025, 2048::bigint) p(id); + """ + ) + cur.execute( + """ + VACUUM (ANALYZE, FREEZE) test1, test2; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_buffercache; + """ + ) + cur.execute( + """ + CREATE EXTENSION pg_prewarm; + """ + ) + + # data preparation is now complete, with 2 disjoint tables that aren't + # preloaded into any caches. + + ep.stop() + + for failpoint in CRITICAL_PG_PS_WAIT_FAILPOINTS: + connect_works_correctly(failpoint, ep, ps, ps_http) + + +ENABLED_FAILPOINTS: Set[str] = set() + + +def connect_works_correctly( + failpoint: str, ep: Endpoint, ps: NeonPageserver, ps_http: PageserverHttpClient +): + log.debug("Starting work on %s", failpoint) + # All queries we use should finish (incl. IO) within 500ms, + # including all their IO. + # This allows us to use `SET statement_timeout` to let the query + # timeout system cancel queries, rather than us having to go + # through the most annoying effort of manual query cancellation + # in psycopg2. 
+ options = "-cstatement_timeout=500ms -ceffective_io_concurrency=1" + + ep.start() + + def fp_enable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "pause"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS | {failpoint} + log.info( + 'Enabled failpoint "%s", current_active=%s', failpoint, ENABLED_FAILPOINTS, stacklevel=2 + ) + + def fp_disable(): + global ENABLED_FAILPOINTS + ps_http.configure_failpoints( + [ + (failpoint, "off"), + ] + ) + ENABLED_FAILPOINTS = ENABLED_FAILPOINTS - {failpoint} + log.info( + 'Disabled failpoint "%s", current_active=%s', + failpoint, + ENABLED_FAILPOINTS, + stacklevel=2, + ) + + def check_buffers(cur): + cur.execute( + """ + SELECT n.nspname AS nspname + , c.relname AS relname + , count(*) AS count + FROM pg_buffercache b + JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase = (SELECT oid FROM pg_database WHERE datname = current_database()) + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE c.oid IN ('test1'::regclass::oid, 'test2'::regclass::oid) + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10 + """ + ) + return cur.fetchone() + + def exec_may_cancel(query, cursor, result, cancels): + if cancels: + with pytest.raises(QueryCanceled): + cursor.execute(query) + assert cursor.fetchone() == result + else: + cursor.execute(query) + assert cursor.fetchone() == result + + fp_disable() + + # Warm caches required for new connections, so that they can run without + # requiring catalog reads. + with closing(ep.connect()) as conn: + with conn.cursor() as cur: + cur.execute( + """ + SELECT 1; + """ + ) + assert cur.fetchone() == (1,) + + assert check_buffers(cur) is None + # Ensure all caches required for connection start are correctly + # filled, so that we don't have any "accidents" in this test run + # caused by changes in connection startup plans that require + # requests to the PageServer. 
+ cur.execute( + """ + select array_agg(distinct (pg_prewarm(c.oid::regclass, 'buffer') >= 0)) + from pg_class c + where c.oid < 16384 AND c.relkind IN ('i', 'r'); + """ + ) + assert cur.fetchone() == ([True],) + + # Enable failpoint + fp_enable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + cur.execute("SHOW statement_timeout;") + assert cur.fetchone() == ("500ms",) + assert check_buffers(cur) is None + exec_may_cancel( + """ + SELECT min(id) FROM test1; + """, + cur, + (1,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + # Do a select on the data, putting some buffers into the prefetch + # queue. + cur.execute( + """ + SELECT count(id) FROM (select * from test1 LIMIT 256) a; + """ + ) + assert cur.fetchone() == (256,) + + ps.stop() + ps.start() + fp_enable() + + exec_may_cancel( + """ + SELECT COUNT(id) FROM test1; + """, + cur, + (1024,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_NBLOCKS, SMGR_DBSIZE}), + ) + + with closing(ep.connect(options=options, autocommit=True)) as conn: + with conn.cursor() as cur: + exec_may_cancel( + """ + SELECT COUNT(id) FROM test2; + """, + cur, + (1024,), + failpoint in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_DBSIZE}), + ) + + fp_disable() + fp_enable() + + exec_may_cancel( + """ + SELECT 0 < pg_database_size(CURRENT_DATABASE()); + """, + cur, + (True,), + failpoint + in (CRITICAL_PG_PS_WAIT_FAILPOINTS - {SMGR_EXISTS, SMGR_GETPAGE, SMGR_NBLOCKS}), + ) + + fp_disable() + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test2; + """ + ) + + assert cur.fetchone() == (1024, 1024, 1025, 2048, 1573376) + + cur.execute( + """ + SELECT count(id), count(distinct payload), min(id), max(id), sum(id) FROM test1; + """ + ) + + assert cur.fetchone() 
== (1024, 1024, 1, 1024, 524800) + + ep.stop() diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index f26d04e2f3..885a94a557 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -1,27 +1,59 @@ # # This file runs pg_regress-based tests. # -from pathlib import Path +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content +from pathlib import Path +from typing import TYPE_CHECKING, cast + +import pytest +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + check_restored_datadir_content, +) +from fixtures.pg_version import PgVersion +from fixtures.remote_storage import s3_storage + +if TYPE_CHECKING: + from typing import Optional + + from fixtures.neon_fixtures import PgBin + from pytest import CaptureFixture # Run the main PostgreSQL regression tests, in src/test/regress. # +@pytest.mark.parametrize("shard_count", [None, 4]) def test_pg_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + build_type: str, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + DBNAME = "regression" + + """ + :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this + many shards. + """ + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + + if build_type == "debug": + # Disable vectored read path cross validation since it makes the test time out. + neon_env_builder.pageserver_config_override = "validate_vectored_get=false" + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_pg_regress", "empty") # Connect to postgres and create a database called "regression". 
- endpoint = env.endpoints.create_start("test_pg_regress") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint = env.endpoints.create_start("main") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" @@ -56,27 +88,90 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. + # + # Unlogged tables were introduced in version 9.1. ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. 
+ results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=DBNAME, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. + if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) # Run the PostgreSQL "isolation" tests, in src/test/isolation. # +@pytest.mark.parametrize("shard_count", [None, 4]) def test_isolation( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_isolation", "empty") # Connect to postgres and create a database called "regression". 
# isolation tests use prepared transactions, so enable them - endpoint = env.endpoints.create_start( - "test_isolation", config_lines=["max_prepared_transactions=100"] - ) + endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) endpoint.safe_psql("CREATE DATABASE isolation_regression") # Create some local directories for pg_isolation_regress to run in. @@ -114,19 +209,24 @@ def test_isolation( # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. +@pytest.mark.parametrize("shard_count", [None, 4]) def test_sql_regress( - neon_simple_env: NeonEnv, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - pg_bin, - capsys, + pg_bin: PgBin, + capsys: CaptureFixture[str], base_dir: Path, pg_distrib_dir: Path, + shard_count: Optional[int], ): - env = neon_simple_env + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) - env.neon_cli.create_branch("test_sql_regress", "empty") # Connect to postgres and create a database called "regression". - endpoint = env.endpoints.create_start("test_sql_regress") + endpoint = env.endpoints.create_start("main") endpoint.safe_psql("CREATE DATABASE regression") # Create some local directories for pg_regress to run in. 
diff --git a/test_runner/regress/test_pg_waldump.py b/test_runner/regress/test_pg_waldump.py new file mode 100644 index 0000000000..8e80efd9ba --- /dev/null +++ b/test_runner/regress/test_pg_waldump.py @@ -0,0 +1,61 @@ +import os +import shutil + +from fixtures.neon_fixtures import NeonEnv, PgBin +from fixtures.utils import subprocess_capture + + +def check_wal_segment(pg_waldump_path: str, segment_path: str, test_output_dir): + # use special --ignore option to ignore the validation checks in pg_waldump + # this is necessary, because neon WAL files contain gap at the beginning + output_path, _, _ = subprocess_capture( + test_output_dir, [pg_waldump_path, "--ignore", segment_path] + ) + + with open(f"{output_path}.stdout", "r") as f: + stdout = f.read() + assert "ABORT" in stdout + assert "COMMIT" in stdout + + +# Simple test to check that pg_waldump works with neon WAL files +def test_pg_waldump(neon_simple_env: NeonEnv, test_output_dir, pg_bin: PgBin): + env = neon_simple_env + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_pg_waldump", "empty") + endpoint = env.endpoints.create_start("test_pg_waldump") + + cur = endpoint.connect().cursor() + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + ROLLBACK; + """ + ) + + cur.execute( + """ + BEGIN; + CREATE TABLE t1(i int primary key, n_updated int); + INSERT INTO t1 select g, 0 from generate_series(1, 50) g; + COMMIT; + """ + ) + + # stop the endpoint to make sure that WAL files are flushed and won't change + endpoint.stop() + + assert endpoint.pgdata_dir + wal_path = os.path.join(endpoint.pgdata_dir, "pg_wal/000000010000000000000001") + pg_waldump_path = os.path.join(pg_bin.pg_bin_path, "pg_waldump") + # check segment on compute + check_wal_segment(pg_waldump_path, wal_path, test_output_dir) + + # Check file on safekeepers as well. 
pg_waldump is strict about file naming, so remove .partial suffix. + sk = env.safekeepers[0] + sk_tli_dir = sk.timeline_dir(tenant_id, timeline_id) + non_partial_path = os.path.join(sk_tli_dir, "000000010000000000000001") + shutil.copyfile(os.path.join(sk_tli_dir, "000000010000000000000001.partial"), non_partial_path) + check_wal_segment(pg_waldump_path, non_partial_path, test_output_dir) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index c2ea5b332a..7e676b5515 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -1,6 +1,6 @@ +from fixtures.common_types import TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.types import TimelineId from fixtures.utils import print_gc_result, query_scalar @@ -10,13 +10,10 @@ from fixtures.utils import print_gc_result, query_scalar # def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # Set pitr interval such that we need to keep the data - neon_env_builder.pageserver_config_override = ( - "tenant_config={pitr_interval = '1 day', gc_horizon = 0}" + env = neon_env_builder.init_start( + initial_tenant_conf={"pitr_interval": "1 day", "gc_horizon": "0"} ) - - env = neon_env_builder.init_start() endpoint_main = env.endpoints.create_start("main") - log.info("postgres is running on 'main' branch") main_pg_conn = endpoint_main.connect() main_cur = main_pg_conn.cursor() diff --git a/test_runner/regress/test_postgres_version.py b/test_runner/regress/test_postgres_version.py new file mode 100644 index 0000000000..03e8c7c0df --- /dev/null +++ b/test_runner/regress/test_postgres_version.py @@ -0,0 +1,35 @@ +import json +import re +from pathlib import Path + +from fixtures.neon_fixtures import PgBin +from fixtures.pg_version import PgVersion + + +def test_postgres_version(base_dir: Path, pg_bin: PgBin, pg_version: PgVersion): + """Test that Postgres version matches the one we expect""" + + with (base_dir 
/ "vendor" / "revisions.json").open() as f: + expected_revisions = json.load(f) + + output_prefix = pg_bin.run_capture(["postgres", "--version"], with_command_header=False) + stdout = Path(f"{output_prefix}.stdout") + assert stdout.exists(), "postgres --version didn't print anything to stdout" + + with stdout.open() as f: + output = f.read().strip() + + # `postgres --version` prints something like "postgres (PostgreSQL) 15.6 (85d809c124a898847a97d66a211f7d5ef4f8e0cb)". + pattern = r"postgres \(PostgreSQL\) (?P\d+\.\d+) \((?P[0-9a-f]{40})\)" + match = re.search(pattern, output, re.IGNORECASE) + assert match is not None, f"Can't parse {output} with {pattern}" + + version = match.group("version") + commit = match.group("commit") + + assert ( + pg_version.v_prefixed in expected_revisions + ), f"Version `{pg_version.v_prefixed}` doesn't exist in `vendor/revisions.json`, please update it if these changes are intentional" + + msg = f"Unexpected Postgres {pg_version} version: `{output}`, please update `vendor/revisions.json` if these changes are intentional" + assert [version, commit] == expected_revisions[pg_version.v_prefixed], msg diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 0f2cd9768f..f446f4f200 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -203,6 +203,21 @@ def test_close_on_connections_exit(static_proxy: NeonProxy): static_proxy.wait_for_exit() +def test_sql_over_http_serverless_driver(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://api.localtest.me:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": "select 42 as answer", "params": []}), + headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / 
"proxy.crt"), + ) + assert response.status_code == 200, response.text + rows = response.json()["rows"] + assert rows == [{"answer": 42}] + + def test_sql_over_http(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") @@ -375,14 +390,47 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): assert result[0]["rows"] == [{"answer": 42}] +def test_sql_over_http_batch_output_options(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps( + { + "queries": [ + {"query": "select $1 as answer", "params": [42], "arrayMode": True}, + {"query": "select $1 as answer", "params": [42], "arrayMode": False}, + ] + } + ), + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Batch-Isolation-Level": "Serializable", + "Neon-Batch-Read-Only": "false", + "Neon-Batch-Deferrable": "false", + }, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + results = response.json()["results"] + + assert results[0]["rowAsArray"] + assert results[0]["rows"] == [["42"]] + + assert not results[1]["rowAsArray"] + assert results[1]["rows"] == [{"answer": "42"}] + + def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("create user http_auth with password 'http' superuser") - def get_pid(status: int, pw: str) -> Any: + def get_pid(status: int, pw: str, user="http_auth") -> Any: return static_proxy.http_query( GET_CONNECTION_PID_QUERY, [], - user="http_auth", + user=user, password=pw, expected_code=status, ) @@ -403,23 +451,29 @@ def test_sql_over_http_pool(static_proxy: NeonProxy): static_proxy.safe_psql("alter user http_auth with password 'http2'") - # after password change, 
should open a new connection to verify it - pid2 = get_pid(200, "http2")["rows"][0]["pid"] - assert pid1 != pid2 + # after password change, shouldn't open a new connection because it checks password in proxy. + rows = get_pid(200, "http2")["rows"] + assert rows == [{"pid": pid1}] time.sleep(0.02) - # query should be on an existing connection - pid = get_pid(200, "http2")["rows"][0]["pid"] - assert pid in [pid1, pid2] - - time.sleep(0.02) - - # old password should not work - res = get_pid(400, "http") + # incorrect user shouldn't reveal that the user doesn't exists + res = get_pid(400, "http", user="http_auth2") assert "password authentication failed for user" in res["message"] +def test_sql_over_http_urlencoding(static_proxy: NeonProxy): + static_proxy.safe_psql("create user \"http+auth$$\" with password '%+$^&*@!' superuser") + + static_proxy.http_query( + "select 1", + [], + user="http+auth$$", + password="%+$^&*@!", + expected_code=200, + ) + + # Beginning a transaction should not impact the next query, # which might come from a completely different client. 
def test_http_pool_begin(static_proxy: NeonProxy): @@ -500,3 +554,49 @@ def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy): "select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data", ) assert response["rows"][0]["data"] == ["foo", "bar", "baz"] + + +@pytest.mark.asyncio +async def test_sql_over_http2(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + resp = await static_proxy.http2_query( + "select 42 as answer", [], user="http", password="http", expected_code=200 + ) + assert resp["rows"] == [{"answer": 42}] + + +def test_sql_over_http_connection_cancel(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + static_proxy.safe_psql("create table test_table ( id int primary key )") + + # insert into a table, with a unique constraint, after sleeping for n seconds + query = "WITH temp AS ( \ + SELECT pg_sleep($1) as sleep, $2::int as id \ + ) INSERT INTO test_table (id) SELECT id FROM temp" + + try: + # The request should complete before the proxy HTTP timeout triggers. + # Timeout and cancel the request on the client side before the query completes. 
+ static_proxy.http_query( + query, + [static_proxy.http_timeout_seconds - 1, 1], + user="http", + password="http", + timeout=2, + ) + except requests.exceptions.ReadTimeout: + pass + + # wait until the query _would_ have been complete + time.sleep(static_proxy.http_timeout_seconds) + + res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) + assert res["command"] == "INSERT", "HTTP query should insert" + assert res["rowCount"] == 1, "HTTP query should insert" + + res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) + assert ( + "duplicate key value violates unique constraint" in res["message"] + ), "HTTP query should conflict" diff --git a/test_runner/regress/test_proxy_allowed_ips.py b/test_runner/regress/test_proxy_allowed_ips.py index f533579811..7a804114ba 100644 --- a/test_runner/regress/test_proxy_allowed_ips.py +++ b/test_runner/regress/test_proxy_allowed_ips.py @@ -24,7 +24,7 @@ async def test_proxy_psql_allowed_ips(static_proxy: NeonProxy, vanilla_pg: Vanil with pytest.raises(psycopg2.Error) as exprinfo: static_proxy.safe_psql(**kwargs) text = str(exprinfo.value).strip() - assert "This IP address is not allowed to connect" in text + assert "not allowed to connect" in text # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) check_cannot_connect(query="select 1", sslsni=0, options="project=private-project") diff --git a/test_runner/regress/test_proxy_rate_limiter.py b/test_runner/regress/test_proxy_rate_limiter.py deleted file mode 100644 index f39f0cad07..0000000000 --- a/test_runner/regress/test_proxy_rate_limiter.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import time -from pathlib import Path -from typing import Iterator - -import pytest -from fixtures.neon_fixtures import ( - PSQL, - NeonProxy, -) -from fixtures.port_distributor import PortDistributor -from pytest_httpserver import HTTPServer -from werkzeug.wrappers.response 
import Response - - -def waiting_handler(status_code: int) -> Response: - # wait more than timeout to make sure that both (two) connections are open. - # It would be better to use a barrier here, but I don't know how to do that together with pytest-httpserver. - time.sleep(2) - return Response(status=status_code) - - -@pytest.fixture(scope="function") -def proxy_with_rate_limit( - port_distributor: PortDistributor, - neon_binpath: Path, - httpserver_listen_address, - test_output_dir: Path, -) -> Iterator[NeonProxy]: - """Neon proxy that routes directly to vanilla postgres.""" - - proxy_port = port_distributor.get_port() - mgmt_port = port_distributor.get_port() - http_port = port_distributor.get_port() - external_http_port = port_distributor.get_port() - (host, port) = httpserver_listen_address - endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" - - with NeonProxy( - neon_binpath=neon_binpath, - test_output_dir=test_output_dir, - proxy_port=proxy_port, - http_port=http_port, - mgmt_port=mgmt_port, - external_http_port=external_http_port, - auth_backend=NeonProxy.Console(endpoint, fixed_rate_limit=5), - ) as proxy: - proxy.start() - yield proxy - - -@pytest.mark.asyncio -async def test_proxy_rate_limit( - httpserver: HTTPServer, - proxy_with_rate_limit: NeonProxy, -): - uri = "/billing/api/v1/usage_events/proxy_get_role_secret" - # mock control plane service - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: Response(status=200) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(429) - ) - httpserver.expect_ordered_request(uri, method="GET").respond_with_handler( - lambda _: waiting_handler(500) - ) - - psql = PSQL(host=proxy_with_rate_limit.host, port=proxy_with_rate_limit.proxy_port) - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - # Limit should be 2. - - # Run two queries in parallel. 
- f1, f2 = await asyncio.gather(psql.run("select 42;"), psql.run("select 42;")) - await proxy_with_rate_limit.find_auth_link(uri, f1) - await proxy_with_rate_limit.find_auth_link(uri, f2) - - # Now limit should be 0. - f = await psql.run("select 42;") - await proxy_with_rate_limit.find_auth_link(uri, f) - - # There last query shouldn't reach the http-server. - assert httpserver.assertions == [] diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py new file mode 100644 index 0000000000..6211446a40 --- /dev/null +++ b/test_runner/regress/test_proxy_websockets.py @@ -0,0 +1,196 @@ +import ssl + +import pytest +import websockets +from fixtures.neon_fixtures import NeonProxy + + +@pytest.mark.asyncio +async def test_websockets(static_proxy: NeonProxy): + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(b"\0") + length = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + await websocket.send([length, startup_message]) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert 
startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_message = password.encode("utf-8") + b"\0" + length = (4 + len(auth_message)).to_bytes(4, byteorder="big") + await websocket.send([b"p", length, auth_message]) + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_message = "SELECT 1".encode("utf-8") + b"\0" + length = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send([b"Q", length, query_message]) + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + query_response[:row_description_len], + query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" 
+ assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() + + +@pytest.mark.asyncio +async def test_websockets_pipelined(static_proxy: NeonProxy): + """ + Test whether we can send the startup + auth + query all in one go + """ + + static_proxy.safe_psql("create user ws_auth with password 'ws' superuser") + + user = "ws_auth" + password = "ws" + + version = b"\x00\x03\x00\x00" + params = { + "user": user, + "database": "postgres", + "client_encoding": "UTF8", + } + + ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt")) + + async with websockets.connect( + f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + ssl=ssl_context, + ) as websocket: + startup_message = bytearray(version) + for key, value in params.items(): + startup_message.extend(key.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(value.encode("ascii")) + startup_message.extend(b"\0") + startup_message.extend(b"\0") + length0 = (4 + len(startup_message)).to_bytes(4, byteorder="big") + + 
auth_message = password.encode("utf-8") + b"\0" + length1 = (4 + len(auth_message)).to_bytes(4, byteorder="big") + query_message = "SELECT 1".encode("utf-8") + b"\0" + length2 = (4 + len(query_message)).to_bytes(4, byteorder="big") + await websocket.send( + length0 + + startup_message + + b"p" + + length1 + + auth_message + + b"Q" + + length2 + + query_message + ) + + startup_response = await websocket.recv() + assert isinstance(startup_response, bytes) + assert startup_response[0:1] == b"R", "should be authentication message" + assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext" + + auth_response = await websocket.recv() + assert isinstance(auth_response, bytes) + assert auth_response[0:1] == b"R", "should be authentication message" + assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message" + assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated" + + query_response = await websocket.recv() + assert isinstance(query_response, bytes) + # 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00' + # 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011' + # 'C\x00\x00\x00\rSELECT 1\x00' + # 'Z\x00\x00\x00\x05I' + + assert query_response[0:1] == b"T", "should be row description message" + row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + row_description, query_response = ( + query_response[:row_description_len], + query_response[row_description_len:], + ) + assert row_description[5:7] == b"\x00\x01", "should have 1 column" + assert row_description[7:16] == b"?column?\0", "column should be named ?column?" 
+ assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4" + + assert query_response[0:1] == b"D", "should be data row message" + data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + data_row, query_response = query_response[:data_row_len], query_response[data_row_len:] + assert ( + data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011" + ), "should contain 1 column with text value 1" + + assert query_response[0:1] == b"C", "should be command complete message" + command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1 + command_complete, query_response = ( + query_response[:command_complete_len], + query_response[command_complete_len:], + ) + assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0" + + assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)" + + # close + await websocket.send(b"X\x00\x00\x00\x04") + await websocket.wait_closed() diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py index e6b3ccd7ec..cc5853b727 100644 --- a/test_runner/regress/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -1,8 +1,8 @@ from contextlib import closing +from fixtures.common_types import Lsn from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index d695410efc..2437c8f806 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -17,13 +17,19 @@ def test_read_validation(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_read_validation", "empty") - endpoint = env.endpoints.create_start("test_read_validation") - log.info("postgres is running on 'test_read_validation' branch") + endpoint = 
env.endpoints.create_start( + "test_read_validation", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. + config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -43,14 +49,12 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Test table is populated, validating buffer cache") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries > 0, "No buffers cached for the test relation" c.execute( - "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format( - relfilenode - ) + f"select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {relfilenode}" ) reln = c.fetchone() assert reln is not None @@ -60,35 +64,33 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select clear_buffer_cache()") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "Failed to clear buffer cache" log.info("Cache is clear, reading stale page version") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format( - first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}', NULL))" ) 
direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" log.info("Cache is clear, reading latest page version without cache") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" @@ -97,9 +99,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -109,9 +109,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format( - reln[0], reln[1], reln[2] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -123,9 +121,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, 
upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}', NULL))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -135,7 +131,7 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select * from page_header(get_raw_page('foo', 'main', 0));") raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") def test_read_validation_neg(neon_simple_env: NeonEnv): @@ -144,22 +140,28 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*") - endpoint = env.endpoints.create_start("test_read_validation_neg") - log.info("postgres is running on 'test_read_validation_neg' branch") + endpoint = env.endpoints.create_start( + "test_read_validation_neg", + # Use protocol version 2, because the code that constructs the V1 messages + # assumes that a primary always wants to read the latest version of a page, + # and therefore doesn't work with the test functions below to read an older + # page version. 
+ config_lines=["neon.protocol_version=2"], + ) with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") log.info("read a page of a missing relation") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('Unknown', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -167,31 +169,31 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): log.info("read a page at lsn 0") try: c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '0/0', NULL))" ) raise AssertionError("query should have failed") except IoError as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") log.info("Pass NULL as an input") expected = (None, None, None) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn(NULL, 'main', 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0'))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', NULL, 0, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0'))" + "select lsn, lower, upper 
from page_header(get_raw_page_at_lsn('foo', 'main', NULL, '0/0', NULL))" ) assert c.fetchone() == expected, "Expected null output" # This check is currently failing, reading beyond EOF is returning a 0-page log.info("Read beyond EOF") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL))" + "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 1, NULL, NULL))" ) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 2d641e36a7..ba8b91e84d 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,8 +1,8 @@ import pytest +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.types import Lsn from fixtures.utils import query_scalar @@ -16,7 +16,6 @@ def test_readonly_node(neon_simple_env: NeonEnv): env = neon_simple_env env.neon_cli.create_branch("test_readonly_node", "empty") endpoint_main = env.endpoints.create_start("test_readonly_node") - log.info("postgres is running on 'test_readonly_node' branch") env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*") diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 9d7a4a8fd6..e21f9bb6f6 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -10,16 +10,24 @@ from fixtures.neon_fixtures import NeonEnvBuilder # def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): # Override default checkpointer settings to run it more often - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "1048576", + } + ) 
env.pageserver.is_testing_enabled_or_skip() + # We expect the pageserver to exit, which will cause storage controller + # requests to fail and warn. + env.storage_controller.allowed_errors.append(".*management API still failed.*") + env.storage_controller.allowed_errors.append( + ".*Reconcile error.*error sending request for url.*" + ) + # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") endpoint = env.endpoints.create_start("test_pageserver_recovery") - log.info("postgres is running on 'test_pageserver_recovery' branch") with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 2fda56d0f4..7f79bf5d5c 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -1,6 +1,3 @@ -# It's possible to run any regular test with the local fs remote storage via -# env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... 
- import os import queue import shutil @@ -9,11 +6,13 @@ import time from typing import Dict, List, Optional, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, wait_for_last_flush_lsn, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -27,8 +26,14 @@ from fixtures.remote_storage import ( RemoteStorageKind, available_remote_storages, ) -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import print_gc_result, query_scalar, wait_until +from fixtures.utils import ( + assert_eq, + assert_ge, + assert_gt, + print_gc_result, + query_scalar, + wait_until, +) from requests import ReadTimeout @@ -73,9 +78,6 @@ def test_remote_storage_backup_and_restore( env.pageserver.allowed_errors.extend( [ - # FIXME: Is this expected? - ".*marking .* as locally complete, while it doesnt exist in remote index.*", - ".*No timelines to attach received.*", ".*Failed to get local tenant state.*", # FIXME retry downloads without throwing errors ".*failed to load remote timeline.*", @@ -123,10 +125,10 @@ def test_remote_storage_backup_and_restore( log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -144,8 +146,11 @@ def test_remote_storage_backup_and_restore( # Introduce failpoint in list remote timelines code path to make tenant_attach fail. # This is before the failures injected by test_remote_failures, so it's a permanent error. 
pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) - env.pageserver.allowed_errors.append( - ".*attach failed.*: storage-sync-list-remote-timelines", + env.pageserver.allowed_errors.extend( + [ + ".*attach failed.*: storage-sync-list-remote-timelines", + ".*Tenant state is Broken: storage-sync-list-remote-timelines.*", + ] ) # Attach it. This HTTP request will succeed and launch a # background task to load the tenant. In that background task, @@ -159,9 +164,13 @@ def test_remote_storage_backup_and_restore( "data": {"reason": "storage-sync-list-remote-timelines"}, } - # Ensure that even though the tenant is broken, we can't attach it again. - with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"): - env.pageserver.tenant_attach(tenant_id) + # Ensure that even though the tenant is broken, retrying the attachment fails + with pytest.raises(Exception, match="Tenant state is Broken"): + # Use same generation as in previous attempt + gen_state = env.storage_controller.inspect(tenant_id) + assert gen_state is not None + generation = gen_state[0] + env.pageserver.tenant_attach(tenant_id, generation=generation) # Restart again, this implicitly clears the failpoint. # test_remote_failures=1 remains active, though, as it's in the pageserver config. @@ -176,10 +185,8 @@ def test_remote_storage_backup_and_restore( ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint" env.pageserver.start() - # Ensure that the pageserver remembers that the tenant was attaching, by - # trying to attach it again. It should fail. - with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"): - env.pageserver.tenant_attach(tenant_id) + # The attach should have got far enough that it recovers on restart (i.e. tenant's + # config was written to local storage). log.info("waiting for tenant to become active. 
this should be quick with on-demand download") wait_until_tenant_active( @@ -226,9 +233,9 @@ def test_remote_storage_upload_queue_retries( tenant_id, timeline_id = env.neon_cli.create_tenant( conf={ # small checkpointing and compaction targets to ensure we generate many upload operations - "checkpoint_distance": f"{128 * 1024}", + "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", - "compaction_target_size": f"{128 * 1024}", + "compaction_target_size": f"{64 * 1024}", # no PITR horizon, we specify the horizon when we request on-demand GC "pitr_interval": "0s", # disable background compaction and GC. We invoke it manually when we want it to happen. @@ -236,6 +243,7 @@ def test_remote_storage_upload_queue_retries( "compaction_period": "0s", # create image layers eagerly, so that GC can remove some layers "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", } ) @@ -254,33 +262,30 @@ def test_remote_storage_upload_queue_retries( ] ) + FOO_ROWS_COUNT = 4000 + def overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data): # create initial set of layers & upload them with failpoints configured - endpoint.safe_psql_many( - [ - f""" - INSERT INTO foo (id, val) - SELECT g, '{data}' - FROM generate_series(1, 20000) g - ON CONFLICT (id) DO UPDATE - SET val = EXCLUDED.val - """, - # to ensure that GC can actually remove some layers - "VACUUM foo", - ] - ) + for _v in range(2): + endpoint.safe_psql_many( + [ + f""" + INSERT INTO foo (id, val) + SELECT g, '{data}' + FROM generate_series(1, {FOO_ROWS_COUNT}) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + # to ensure that GC can actually remove some layers + "VACUUM foo", + ] + ) wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) def get_queued_count(file_kind, op_kind): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, + return 
client.get_remote_timeline_client_queue_count( + tenant_id, timeline_id, file_kind, op_kind ) - assert val is not None, "expecting metric to be present" - return int(val) # create some layers & wait for uploads to finish overwrite_data_and_wait_for_it_to_arrive_at_pageserver("a") @@ -293,9 +298,9 @@ def test_remote_storage_upload_queue_retries( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # let all future operations queue up configure_storage_sync_failpoints("return") @@ -323,21 +328,22 @@ def test_remote_storage_upload_queue_retries( churn_while_failpoints_active_thread.start() # wait for churn thread's data to get stuck in the upload queue - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2) - wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0) + # Exponential back-off in upload queue, so, gracious timeouts. + + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2)) + wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # unblock churn operations configure_storage_sync_failpoints("off") - # ... and wait for them to finish. 
Exponential back-off in upload queue, so, gracious timeouts. - wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0) - wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0) - wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # The churn thread doesn't make progress once it blocks on the first wait_completion() call, # so, give it some time to wrap up. - churn_while_failpoints_active_thread.join(30) + churn_while_failpoints_active_thread.join(60) assert not churn_while_failpoints_active_thread.is_alive() assert churn_thread_result[0] @@ -348,7 +354,7 @@ def test_remote_storage_upload_queue_retries( env.pageserver.stop(immediate=True) env.endpoints.stop_all() - # We are about to forcibly drop local dirs. Attachment service will increment generation in re-attach before + # We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before # we later increment when actually attaching it again, leading to skipping a generation and potentially getting # these warnings if there was a durable but un-executed deletion list at time of restart. 
env.pageserver.allowed_errors.extend( @@ -369,7 +375,7 @@ def test_remote_storage_upload_queue_retries( log.info("restarting postgres to validate") endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) with endpoint.cursor() as cur: - assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000 + assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == FOO_ROWS_COUNT def test_remote_timeline_client_calls_started_metric( @@ -383,6 +389,7 @@ def test_remote_timeline_client_calls_started_metric( initial_tenant_conf={ # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{128 * 1024}", + # ensure each timeline_checkpoint() calls creates L1s "compaction_threshold": "1", "compaction_target_size": f"{128 * 1024}", # no PITR horizon, we specify the horizon when we request on-demand GC @@ -390,8 +397,6 @@ def test_remote_timeline_client_calls_started_metric( # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": "1", } ) @@ -432,7 +437,7 @@ def test_remote_timeline_client_calls_started_metric( assert timeline_id is not None for (file_kind, op_kind), observations in calls_started.items(): val = client.get_metric_value( - name="pageserver_remote_timeline_client_calls_started_count", + name="pageserver_remote_timeline_client_calls_started_total", filter={ "file_kind": str(file_kind), "op_kind": str(op_kind), @@ -450,12 +455,17 @@ def test_remote_timeline_client_calls_started_metric( ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" def churn(data_pass1, data_pass2): + # overwrite the same data in place, vacuum inbetween, and + # and create image layers; then run a gc(). 
+ # this should + # - create new layers + # - delete some layers overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) gc_result = client.timeline_gc(tenant_id, timeline_id, 0) print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 @@ -535,16 +545,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( client = env.pageserver.http_client() - def get_queued_count(file_kind, op_kind): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, - ) - return int(val) if val is not None else val - endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) client.configure_failpoints(("before-upload-layer", "return")) @@ -578,7 +578,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( def assert_compacted_and_uploads_queued(): assert timeline_path.exists() assert len(list(timeline_path.glob("*"))) >= 8 - assert get_queued_count(file_kind="index", op_kind="upload") > 0 + assert ( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload") + > 0 + ) wait_until(20, 0.1, assert_compacted_and_uploads_queued) @@ -616,7 +619,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( assert len(filtered) == 0 # timeline deletion should kill ongoing uploads, so, the metric will be gone - assert 
get_queued_count(file_kind="index", op_kind="upload") is None + assert ( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload") + is None + ) # timeline deletion should be unblocking checkpoint ops checkpoint_thread.join(2.0) @@ -702,10 +708,8 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv # index upload is now hitting the failpoint, it should block the shutdown env.pageserver.stop(immediate=True) - local_metadata = ( - env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata" - ) - assert local_metadata.is_file() + timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) + assert timeline_dir.is_dir() assert isinstance(env.pageserver_remote_storage, LocalFsStorage) @@ -826,17 +830,19 @@ def test_compaction_waits_for_upload( assert len(upload_stuck_layers) > 0 for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "while uploads are stuck the layers should be present on disk" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "while uploads are stuck the layers should be present on disk" # now this will do the L0 => L1 compaction and want to remove # upload_stuck_layers and the original initdb L0 client.timeline_checkpoint(tenant_id, timeline_id) - # as uploads are paused, the the upload_stuck_layers should still be with us + # as uploads are paused, the upload_stuck_layers should still be with us for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert path.exists(), "uploads are stuck still over compaction" + assert env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) + ), "uploads are stuck still over compaction" compacted_layers = client.layer_map_info(tenant_id, timeline_id).historic_by_name() overlap = 
compacted_layers.intersection(upload_stuck_layers) @@ -870,9 +876,8 @@ def test_compaction_waits_for_upload( wait_until(10, 1, until_layer_deletes_completed) for name in upload_stuck_layers: - path = env.pageserver.timeline_dir(tenant_id, timeline_id) / name - assert ( - not path.exists() + assert not env.pageserver.layer_exists( + tenant_id, timeline_id, parse_layer_file_name(name) ), "l0 should now be removed because of L0 => L1 compaction and completed uploads" # We should not have hit the error handling path in uploads where a uploaded file is gone @@ -887,26 +892,23 @@ def wait_upload_queue_empty( wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="index", op_kind="upload" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 + ), ) wait_until( 2, 1, - lambda: get_queued_count( - client, tenant_id, timeline_id, file_kind="layer", op_kind="delete" - ) - == 0, + lambda: assert_eq( + get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 + ), ) @@ -917,16 +919,8 @@ def get_queued_count( file_kind: str, op_kind: str, ): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_unfinished", - tenant_id, - timeline_id, - file_kind, - op_kind, - ) - if val is None: - return val - return int(val) + """The most important aspect of this function is shorter name & no return type so asserts are more concise.""" + return client.get_remote_timeline_client_queue_count(tenant_id, timeline_id, file_kind, op_kind) def assert_nothing_to_upload( diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py new file 
mode 100644 index 0000000000..2360745990 --- /dev/null +++ b/test_runner/regress/test_replication_start.py @@ -0,0 +1,32 @@ +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup + + +@pytest.mark.xfail +def test_replication_start(neon_simple_env: NeonEnv): + env = neon_simple_env + + with env.endpoints.create_start(branch_name="main", endpoint_id="primary") as primary: + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute("begin") + p_cur.execute("create table t(pk integer primary key, payload integer)") + p_cur.execute("insert into t values (generate_series(1,100000), 0)") + p_cur.execute("select txid_current()") + xid = p_cur.fetchall()[0][0] + log.info(f"Master transaction {xid}") + with env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary" + ) as secondary: + wait_replica_caughtup(primary, secondary) + with secondary.connect() as s_con: + with s_con.cursor() as s_cur: + # Enforce setting hint bits for pg_class tuples. + # If master's transaction is not marked as in-progress in MVCC snapshot, + # then XMIN_INVALID hint bit will be set for table's 't' tuple making it invisible. 
+ s_cur.execute("select * from pg_class") + p_cur.execute("commit") + wait_replica_caughtup(primary, secondary) + s_cur.execute("select * from t where pk = 1") + assert s_cur.fetchone() == (1, 0) diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py new file mode 100644 index 0000000000..6383d24c57 --- /dev/null +++ b/test_runner/regress/test_s3_restore.py @@ -0,0 +1,131 @@ +import time +from datetime import datetime, timezone + +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, +) +from fixtures.pageserver.utils import ( + MANY_SMALL_LAYERS_TENANT_CONFIG, + assert_prefix_empty, + enable_remote_storage_versioning, + poll_for_remote_storage_iterations, + tenant_delete_wait_completed, + wait_for_upload, +) +from fixtures.remote_storage import RemoteStorageKind, s3_storage +from fixtures.utils import run_pg_bench_small + + +def test_tenant_s3_restore( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # Mock S3 doesn't have versioning enabled by default, enable it + # (also do it before there is any writes to the bucket) + if remote_storage_kind == RemoteStorageKind.MOCK_S3: + remote_storage = neon_env_builder.pageserver_remote_storage + assert remote_storage, "remote storage not configured" + enable_remote_storage_versioning(remote_storage) + + # change it back after initdb, recovery doesn't work if the two + # index_part.json uploads happen at same second or too close to each other. 
+ initial_tenant_conf = MANY_SMALL_LAYERS_TENANT_CONFIG + del initial_tenant_conf["checkpoint_distance"] + + env = neon_env_builder.init_start(initial_tenant_conf) + env.pageserver.allowed_errors.extend( + [ + # The deletion queue will complain when it encounters simulated S3 errors + ".*deletion executor: DeleteObjects request failed.*", + # lucky race with stopping from flushing a layer we fail to schedule any uploads + ".*layer flush task.+: could not flush frozen layer: update_metadata_file", + ] + ) + + ps_http = env.pageserver.http_client() + tenant_id = env.initial_tenant + + # now lets create the small layers + ps_http.set_tenant_config(tenant_id, MANY_SMALL_LAYERS_TENANT_CONFIG) + + # Default tenant and the one we created + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 + + # create two timelines one being the parent of another, both with non-trivial data + parent = "main" + last_flush_lsns = [] + + for timeline in ["first", "second"]: + timeline_id = env.neon_cli.create_branch( + timeline, tenant_id=tenant_id, ancestor_branch_name=parent + ) + with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql(f"CREATE TABLE created_{timeline}(id integer);") + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + last_flush_lsns.append(last_flush_lsn) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}") + parent = timeline + + # These sleeps are important because they fend off differences in clocks between us and S3 + time.sleep(4) + ts_before_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + assert ( + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 + ), "tenant removed before we deletion was 
issued" + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + tenant_delete_wait_completed(ps_http, tenant_id, iterations) + ps_http.deletion_queue_flush(execute=True) + assert ( + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + ), "tenant removed before we deletion was issued" + env.storage_controller.attach_hook_drop(tenant_id) + + tenant_path = env.pageserver.tenant_dir(tenant_id) + assert not tenant_path.exists() + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + time.sleep(4) + ts_after_deletion = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + ps_http.tenant_time_travel_remote_storage( + tenant_id, timestamp=ts_before_deletion, done_if_after=ts_after_deletion + ) + + generation = env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) + + ps_http.tenant_attach(tenant_id, generation=generation) + env.pageserver.quiesce_tenants() + + for tline in ps_http.timeline_list(env.initial_tenant): + log.info(f"timeline detail: {tline}") + + for i, timeline in enumerate(["first", "second"]): + with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: + endpoint.safe_psql(f"SELECT * FROM created_{timeline};") + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + expected_last_flush_lsn = last_flush_lsns[i] + # There might be some activity that advances the lsn so we can't use a strict equality check + assert last_flush_lsn >= expected_last_flush_lsn, "last_flush_lsn too old" + + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py new file mode 100644 index 0000000000..56075c5975 --- /dev/null +++ b/test_runner/regress/test_sharding.py @@ -0,0 +1,1498 @@ +import os +import time +from collections import 
defaultdict +from typing import Dict, List, Optional, Union + +import pytest +import requests +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.compute_reconfigure import ComputeReconfigure +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + StorageControllerApiException, + StorageScrubber, + last_flush_lsn_upload, + tenant_get_shards, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty +from fixtures.remote_storage import s3_storage +from fixtures.utils import wait_until +from fixtures.workload import Workload +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +def test_sharding_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basic lifecycle of a sharded tenant: + - ingested data gets split up + - page service reads + - timeline creation and deletion + - splits + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + + # 1MiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. + stripe_size = 128 + + # Use S3-compatible remote storage so that we can scrub: this test validates + # that the scrubber doesn't barf when it sees a sharded tenant. 
+ neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + + neon_env_builder.preserve_database_files = True + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.storage_controller.locate(tenant_id) + + def get_sizes(): + sizes = {} + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + sizes[node_id] = pageserver.http_client().tenant_status(shard["shard_id"])[ + "current_physical_size" + ] + log.info(f"sizes = {sizes}") + return sizes + + # The imported initdb for timeline creation should + # not be fully imported on every shard. We use a 1MB stripe size so expect + # pretty good distribution: no one shard should have more than half the data + sizes = get_sizes() + physical_initdb_total = sum(sizes.values()) + expect_initdb_size = 20 * 1024 * 1024 + assert physical_initdb_total > expect_initdb_size + assert all(s < expect_initdb_size // 2 for s in sizes.values()) + + # Test that timeline creation works on a sharded tenant + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + # Test that we can write data to a sharded tenant + workload = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload.init() + + sizes_before = get_sizes() + workload.write_rows(256) + + # Test that we can read data back from a sharded tenant + workload.validate() + + # Validate that the data is spread across pageservers + sizes_after = get_sizes() + # Our sizes increased when we wrote data + assert sum(sizes_after.values()) > sum(sizes_before.values()) + # That increase is present on all shards + assert all(sizes_after[ps.id] > sizes_before[ps.id] for ps in env.pageservers) + + # Validate that timeline list API works properly on all shards + for shard in shards: + node_id = 
int(shard["node_id"]) + pageserver = pageservers[node_id] + timelines = set( + TimelineId(tl["timeline_id"]) + for tl in pageserver.http_client().timeline_list(shard["shard_id"]) + ) + assert timelines == {env.initial_timeline, timeline_b} + + env.storage_controller.consistency_check() + + # Validate that deleting a sharded tenant removes all files in the prefix + + # Before deleting, stop the client and check we have some objects to delete + workload.stop() + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + # Check the scrubber isn't confused by sharded content, then disable + # it during teardown because we'll have deleted by then + StorageScrubber(neon_env_builder).scan_metadata() + neon_env_builder.scrub_on_exit = False + + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + env.storage_controller.consistency_check() + + +def test_sharding_split_unsharded( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that shard splitting works on a tenant created as unsharded (i.e. with + ShardCount(0)). 
+ """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + + # Split one shard into two + env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + + # Check we got the shard IDs we expected + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload.validate() + + env.storage_controller.consistency_check() + + +@pytest.mark.parametrize( + "failpoint", + [ + None, + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + "compact-shard-ancestors-persistent", + ], +) +def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]): + """ + Test that after a split, we clean up parent layer data in the child shards via compaction. + """ + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "3600s", + # disable background compaction and GC. We invoke it manually when we want it to happen. 
+ "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + } + + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. + "max_unavailable": "300s" + } + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + workload.stop() + + # Split one shard into two + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + + # Let all shards move into their stable locations, so that during subsequent steps we + # don't have reconciles in progress (simpler to reason about what messages we expect in logs) + env.storage_controller.reconcile_until_idle() + + # Check we got the shard IDs we expected + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload.validate() + workload.stop() + + env.storage_controller.consistency_check() + + # Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes + detail_before = ps.http_client().timeline_detail(shard, timeline_id) + ps.http_client().timeline_compact(shard, timeline_id) + detail_after = 
ps.http_client().timeline_detail(shard, timeline_id) + + # Physical size should shrink because some layers have been dropped + assert detail_after["current_physical_size"] < detail_before["current_physical_size"] + + # Compaction shouldn't make anything unreadable + workload.validate() + + # Force a generation increase: layer rewrites are a long-term thing and only happen after + # the generation has increased. + env.pageserver.stop() + env.pageserver.start() + + # Cleanup part 2: once layers are outside the PITR window, they will be rewritten if they are partially redundant + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, {"pitr_interval": "0s"}) + env.storage_controller.reconcile_until_idle() + + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + # Apply failpoints for the layer-rewriting phase: this is the area of code that has sensitive behavior + # across restarts, as we will have local layer files that temporarily disagree with the remote metadata + # for the same local layer file name. + if failpoint is not None: + ps.http_client().configure_failpoints((failpoint, "exit")) + + # Do a GC to update gc_info (compaction uses this to decide whether a layer is to be rewritten) + # Set gc_horizon=0 to let PITR horizon control GC cutoff exclusively. 
+ ps.http_client().timeline_gc(shard, timeline_id, gc_horizon=0) + + # We will compare stats before + after compaction + detail_before = ps.http_client().timeline_detail(shard, timeline_id) + + # Invoke compaction: this should rewrite layers that are behind the pitr horizon + try: + ps.http_client().timeline_compact(shard, timeline_id) + except requests.ConnectionError as e: + if failpoint is None: + raise e + else: + log.info(f"Compaction failed (failpoint={failpoint}): {e}") + + if failpoint in ( + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + ): + # If we left local files that don't match remote metadata, we expect warnings on next startup + env.pageserver.allowed_errors.append( + ".*removing local file .+ because it has unexpected length.*" + ) + + # Post-failpoint: we check that the pageserver comes back online happily. + env.pageserver.running = False + env.pageserver.start() + else: + assert failpoint is None # We shouldn't reach success path if a failpoint was set + + detail_after = ps.http_client().timeline_detail(shard, timeline_id) + + # Physical size should shrink because layers are smaller + assert detail_after["current_physical_size"] < detail_before["current_physical_size"] + + # Validate size statistics + for shard in shards: + ps = env.get_tenant_pageserver(shard) + timeline_info = ps.http_client().timeline_detail(shard, timeline_id) + reported_size = timeline_info["current_physical_size"] + layer_paths = ps.list_layers(shard, timeline_id) + measured_size = 0 + for p in layer_paths: + abs_path = ps.timeline_dir(shard, timeline_id) / p + measured_size += os.stat(abs_path).st_size + + log.info( + f"shard {shard} reported size {reported_size}, measured size {measured_size} ({len(layer_paths)} layers)" + ) + + if failpoint in ( + "compact-shard-ancestors-localonly", + "compact-shard-ancestors-enqueued", + ): + # If we injected a failure between local rewrite and remote upload, then after + # restart we may end up with 
neither version of the file on local disk (the new file + # is cleaned up because it doesn't match remote metadata). So local size isn't + # necessarily going to match remote physical size. + continue + + assert measured_size == reported_size + + # Compaction shouldn't make anything unreadable + workload.validate() + + +def test_sharding_split_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basics of shard splitting: + - The API results in more shards than we started with + - The tenant's data remains readable + + """ + + # We will start with 4 shards and split into 8, then migrate all those + # 8 shards onto separate pageservers + shard_count = 4 + split_shard_count = 8 + neon_env_builder.num_pageservers = split_shard_count * 2 + + # 1MiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. + stripe_size = 128 + + # Use S3-compatible remote storage so that we can scrub: this test validates + # that the scrubber doesn't barf when it sees a sharded tenant. 
+ neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.enable_scrub_on_exit() + + neon_env_builder.preserve_database_files = True + + non_default_tenant_config = {"gc_horizon": 77 * 1024 * 1024} + + env = neon_env_builder.init_configs(True) + neon_env_builder.start() + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, + timeline_id, + shard_count=shard_count, + shard_stripe_size=stripe_size, + placement_policy='{"Attached": 1}', + conf=non_default_tenant_config, + ) + + workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + + # Initial data + workload.write_rows(256) + + # Note which pageservers initially hold a shard after tenant creation + pre_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] + + # For pageservers holding a shard, validate their ingest statistics + # reflect a proper splitting of the WAL. + for pageserver in env.pageservers: + if pageserver.id not in pre_split_pageserver_ids: + continue + + metrics = pageserver.http_client().get_metrics_values( + [ + "pageserver_wal_ingest_records_received_total", + "pageserver_wal_ingest_records_committed_total", + "pageserver_wal_ingest_records_filtered_total", + ] + ) + + log.info(f"Pageserver {pageserver.id} metrics: {metrics}") + + # Not everything received was committed + assert ( + metrics["pageserver_wal_ingest_records_received_total"] + > metrics["pageserver_wal_ingest_records_committed_total"] + ) + + # Something was committed + assert metrics["pageserver_wal_ingest_records_committed_total"] > 0 + + # Counts are self consistent + assert ( + metrics["pageserver_wal_ingest_records_received_total"] + == metrics["pageserver_wal_ingest_records_committed_total"] + + metrics["pageserver_wal_ingest_records_filtered_total"] + ) + + # TODO: validate that shards have different sizes + + workload.validate() + + assert len(pre_split_pageserver_ids) 
== 4 + + def shards_on_disk(shard_ids): + for pageserver in env.pageservers: + for shard_id in shard_ids: + if pageserver.tenant_dir(shard_id).exists(): + return True + + return False + + old_shard_ids = [TenantShardId(tenant_id, i, shard_count) for i in range(0, shard_count)] + # Before split, old shards exist + assert shards_on_disk(old_shard_ids) + + # Before split, we have done one reconcile for each shard + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + == shard_count + ) + + # Make secondary downloads slow: this exercises the storage controller logic for not migrating an attachment + # during post-split optimization until the secondary is ready + for ps in env.pageservers: + ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")]) + + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) + + post_split_pageserver_ids = [loc["node_id"] for loc in env.storage_controller.locate(tenant_id)] + # We should have split into 8 shards, on the same 4 pageservers we started on. + assert len(post_split_pageserver_ids) == split_shard_count + assert len(set(post_split_pageserver_ids)) == shard_count + assert set(post_split_pageserver_ids) == set(pre_split_pageserver_ids) + + # The old parent shards should no longer exist on disk + assert not shards_on_disk(old_shard_ids) + + # Enough background reconciliations should result in the shards being properly distributed. + # Run this before the workload, because its LSN-waiting code presumes stable locations. 
+ env.storage_controller.reconcile_until_idle(timeout_secs=60) + + workload.validate() + + workload.churn_rows(256) + + workload.validate() + + # Run GC on all new shards, to check they don't barf or delete anything that breaks reads + # (compaction was already run as part of churn_rows) + all_shards = tenant_get_shards(env, tenant_id) + for tenant_shard_id, pageserver in all_shards: + pageserver.http_client().timeline_gc(tenant_shard_id, timeline_id, None) + workload.validate() + + # Assert on how many reconciles happened during the process. This is something of an + # implementation detail, but it is useful to detect any bugs that might generate spurious + # extra reconcile iterations. + # + # We'll have: + # - shard_count reconciles for the original setup of the tenant + # - shard_count reconciles for detaching the original secondary locations during split + # - split_shard_count reconciles during shard splitting, for setting up secondaries. + # - shard_count of the child shards will need to fail over to their secondaries + # - shard_count of the child shard secondary locations will get moved to emptier nodes + expect_reconciles = shard_count * 2 + split_shard_count + shard_count * 2 + reconcile_ok = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + assert reconcile_ok == expect_reconciles + + # Check that no cancelled or errored reconciliations occurred: this test does no + # failure injection and should run clean. 
+ cancelled_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "cancel"} + ) + errored_reconciles = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} + ) + assert cancelled_reconciles is not None and int(cancelled_reconciles) == 0 + assert errored_reconciles is not None and int(errored_reconciles) == 0 + + # We should see that the migration of shards after the split waited for secondaries to warm up + # before happening + assert env.storage_controller.log_contains(".*Skipping.*because secondary isn't ready.*") + + env.storage_controller.consistency_check() + + def get_node_shard_counts(env: NeonEnv, tenant_ids): + total: defaultdict[int, int] = defaultdict(int) + attached: defaultdict[int, int] = defaultdict(int) + for tid in tenant_ids: + for shard in env.storage_controller.tenant_describe(tid)["shards"]: + log.info( + f"{shard['tenant_shard_id']}: attached={shard['node_attached']}, secondary={shard['node_secondary']} " + ) + for node in shard["node_secondary"]: + total[int(node)] += 1 + attached[int(shard["node_attached"])] += 1 + total[int(shard["node_attached"])] += 1 + + return total, attached + + def check_effective_tenant_config(): + # Expect our custom tenant configs to have survived the split + for shard in env.storage_controller.tenant_describe(tenant_id)["shards"]: + node = env.get_pageserver(int(shard["node_attached"])) + config = node.http_client().tenant_config(TenantShardId.parse(shard["tenant_shard_id"])) + for k, v in non_default_tenant_config.items(): + assert config.effective_config[k] == v + + # Validate pageserver state: expect every child shard to have an attached and secondary location + (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) + assert sum(attached.values()) == split_shard_count + assert sum(total.values()) == split_shard_count * 2 + check_effective_tenant_config() + + # More 
specific check: that we are fully balanced. This is deterministic because + # the order in which we consider shards for optimization is deterministic, and the + # order of preference of nodes is also deterministic (lower node IDs win). + log.info(f"total: {total}") + assert total == { + 1: 1, + 2: 1, + 3: 1, + 4: 1, + 5: 1, + 6: 1, + 7: 1, + 8: 1, + 9: 1, + 10: 1, + 11: 1, + 12: 1, + 13: 1, + 14: 1, + 15: 1, + 16: 1, + } + log.info(f"attached: {attached}") + assert attached == {1: 1, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1, 9: 1, 11: 1} + + # Ensure post-split pageserver locations survive a restart (i.e. the child shards + # correctly wrote config to disk, and the storage controller responds correctly + # to /re-attach) + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Validate pageserver state: expect every child shard to have an attached and secondary location + (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) + assert sum(attached.values()) == split_shard_count + assert sum(total.values()) == split_shard_count * 2 + check_effective_tenant_config() + + workload.validate() + + +@pytest.mark.parametrize("initial_stripe_size", [None, 65536]) +def test_sharding_split_stripe_size( + neon_env_builder: NeonEnvBuilder, + httpserver: HTTPServer, + httpserver_listen_address, + initial_stripe_size: int, +): + """ + Check that modifying stripe size inline with a shard split works as expected + """ + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.num_pageservers = 1 + + # Set up fake HTTP notify endpoint: we will use this to validate that we receive + # the correct stripe size after split. 
+ notifications = [] + + def handler(request: Request): + log.info(f"Notify request: {request}") + notifications.append(request.json) + return Response(status=200) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + env = neon_env_builder.init_start( + initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size + ) + tenant_id = env.initial_tenant + + assert len(notifications) == 1 + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + } + assert notifications[0] == expect + + new_stripe_size = 2048 + env.storage_controller.tenant_shard_split( + tenant_id, shard_count=2, shard_stripe_size=new_stripe_size + ) + env.storage_controller.reconcile_until_idle() + + # Check that we ended up with the stripe size that we expected, both on the pageserver + # and in the notifications to compute + assert len(notifications) == 2 + expect_after: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": new_stripe_size, + "shards": [ + {"node_id": int(env.pageservers[0].id), "shard_number": 0}, + {"node_id": int(env.pageservers[0].id), "shard_number": 1}, + ], + } + log.info(f"Got notification: {notifications[1]}") + assert notifications[1] == expect_after + + # Inspect the stripe size on the pageserver + shard_0_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2)) + ) + assert shard_0_loc["shard_stripe_size"] == new_stripe_size + shard_1_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2)) + ) + assert shard_1_loc["shard_stripe_size"] == new_stripe_size + + # Ensure stripe size survives a pageserver restart + env.pageservers[0].stop() + env.pageservers[0].start() + shard_0_loc = ( + 
env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 0, 2)) + ) + assert shard_0_loc["shard_stripe_size"] == new_stripe_size + shard_1_loc = ( + env.pageservers[0].http_client().tenant_get_location(TenantShardId(tenant_id, 1, 2)) + ) + assert shard_1_loc["shard_stripe_size"] == new_stripe_size + + # Ensure stripe size survives a storage controller restart + env.storage_controller.stop() + env.storage_controller.start() + + def assert_restart_notification(): + assert len(notifications) == 3 + assert notifications[2] == expect_after + + wait_until(10, 1, assert_restart_notification) + + +@pytest.mark.skipif( + # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're + # validating in this test don't benefit much from debug assertions. + os.getenv("BUILD_TYPE") == "debug", + reason="Avoid running bulkier ingest tests in debug mode", +) +def test_sharding_ingest_layer_sizes( + neon_env_builder: NeonEnvBuilder, +): + """ + Check that when ingesting data to a sharded tenant, we properly respect layer size limits. + """ + + # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic + # without writing a lot of data. + expect_layer_size = 131072 + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{expect_layer_size}", + "compaction_target_size": f"{expect_layer_size}", + # aim to reduce flakiness, we are not doing explicit checkpointing + "compaction_period": "0s", + "gc_period": "0s", + } + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + # A stripe size the same order of magnitude as layer size: this ensures that + # within checkpoint_distance some shards will have no data to ingest, if LSN + # contains sequential page writes. 
This test checks that this kind of + # scenario doesn't result in some shards emitting empty/tiny layers. + initial_tenant_shard_stripe_size=expect_layer_size // 8192, + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # ignore the initdb layer(s) for the purposes of the size comparison as an initdb image layer optimization + # will produce a lot more smaller layers. + initial_layers_per_shard = {} + log.info("initdb distribution (not asserted on):") + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layers = ( + env.get_pageserver(shard["node_id"]).http_client().layer_map_info(shard_id, timeline_id) + ) + for layer in layers.historic_layers: + log.info( + f"layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size})" + ) + + initial_layers_per_shard[shard_id] = set(layers.historic_layers) + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + + workload.validate() + + small_layer_count = 0 + ok_layer_count = 0 + huge_layer_count = 0 + + # Inspect the resulting layer map, count how many layers are undersized. 
+ for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) + + historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) + + initial_layers = initial_layers_per_shard[shard_id] + + for layer in historic_layers: + if layer in initial_layers: + # ignore the initdb image layers for the size histogram + continue + + if layer.layer_file_size < expect_layer_size // 2: + classification = "Small" + small_layer_count += 1 + elif layer.layer_file_size > expect_layer_size * 2: + classification = "Huge " + huge_layer_count += 1 + else: + classification = "OK " + ok_layer_count += 1 + + if layer.kind == "Delta": + assert layer.lsn_end is not None + lsn_size = Lsn(layer.lsn_end) - Lsn(layer.lsn_start) + else: + lsn_size = 0 + + log.info( + f"{classification} layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size}, LSN distance {lsn_size})" + ) + + # Why an inexact check? + # - Because we roll layers on checkpoint_distance * shard_count, we expect to obey the target + # layer size on average, but it is still possible to write some tiny layers. + log.info(f"Totals: {small_layer_count} small layers, {ok_layer_count} ok layers") + if small_layer_count <= shard_count: + # If each shard has <= 1 small layer + pass + else: + # General case: + # old limit was 0.25 but pg14 is right at the limit with 7/28 + assert float(small_layer_count) / float(ok_layer_count) < 0.3 + + # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. 
+ assert huge_layer_count <= shard_count + + +def test_sharding_ingest_gaps( + neon_env_builder: NeonEnvBuilder, +): + """ + Check ingest behavior when the incoming data results in some shards having gaps where + no data is ingested: they should advance their disk_consistent_lsn and remote_consistent_lsn + even if they aren't writing out layers. + """ + + # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic + # without writing a lot of data. + expect_layer_size = 131072 + checkpoint_interval_secs = 5 + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{expect_layer_size}", + "compaction_target_size": f"{expect_layer_size}", + # Set a short checkpoint interval as we will wait for uploads to happen + "checkpoint_timeout": f"{checkpoint_interval_secs}s", + # Background checkpointing is done from compaction loop, so set that interval short too + "compaction_period": "1s", + } + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=128, + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Just a few writes: we aim to produce a situation where some shards are skipping + # ingesting some records and thereby won't have layer files that advance their + # consistent LSNs, to exercise the code paths that explicitly handle this case by + # advancing consistent LSNs in the background if there is no open layer. 
+ workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=False) + workload.churn_rows(128, upload=False) + + # Checkpoint, so that we won't get a background checkpoint happening during the next step + workload.endpoint().safe_psql("checkpoint") + # Freeze + flush, so that subsequent writes will start from a position of no open layers + last_flush_lsn_upload(env, workload.endpoint(), tenant_id, timeline_id) + + # This write is tiny: at least some of the shards should find they don't have any + # data to ingest. This will exercise how they handle that. + workload.churn_rows(1, upload=False) + + # The LSN that has reached pageservers, but may not have been flushed to historic layers yet + expect_lsn = wait_for_last_flush_lsn(env, workload.endpoint(), tenant_id, timeline_id) + + # Don't leave the endpoint running, we don't want it writing in the background + workload.stop() + + log.info(f"Waiting for shards' consistent LSNs to reach {expect_lsn}") + + shards = tenant_get_shards(env, tenant_id, None) + + def assert_all_disk_consistent(): + """ + Assert that all the shards' disk_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) + + def assert_all_remote_consistent(): + """ + Assert that all the shards' remote_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert 
Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) + + workload.validate() + + +class Failure: + pageserver_id: Optional[int] + + def apply(self, env: NeonEnv): + raise NotImplementedError() + + def clear(self, env: NeonEnv): + """ + Clear the failure, in a way that should enable the system to proceed + to a totally clean state (all nodes online and reconciled) + """ + raise NotImplementedError() + + def expect_available(self): + raise NotImplementedError() + + def can_mitigate(self): + """Whether Self.mitigate is available for use""" + return False + + def mitigate(self, env: NeonEnv): + """ + Mitigate the failure in a way that should allow shard split to + complete and service to resume, but does not guarantee to leave + the whole world in a clean state (e.g. an Offline node might have + junk LocationConfigs on it) + """ + raise NotImplementedError() + + def fails_forward(self, env: NeonEnv): + """ + If true, this failure results in a state that eventually completes the split. + """ + return False + + def expect_exception(self): + """ + How do we expect a call to the split API to fail? 
+ """ + return StorageControllerApiException + + +class PageserverFailpoint(Failure): + def __init__(self, failpoint, pageserver_id, mitigate): + self.failpoint = failpoint + self.pageserver_id = pageserver_id + self._mitigate = mitigate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.allowed_errors.extend( + [".*failpoint.*", ".*Resetting.*after shard split failure.*"] + ) + pageserver.http_client().configure_failpoints((self.failpoint, "return(1)")) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "off")) + if self._mitigate: + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Active"}) + + def expect_available(self): + return True + + def can_mitigate(self): + return self._mitigate + + def mitigate(self, env): + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"}) + + +class StorageControllerFailpoint(Failure): + def __init__(self, failpoint, action): + self.failpoint = failpoint + self.pageserver_id = None + self.action = action + + def apply(self, env: NeonEnv): + env.storage_controller.configure_failpoints((self.failpoint, self.action)) + + def clear(self, env: NeonEnv): + if "panic" in self.action: + log.info("Restarting storage controller after panic") + env.storage_controller.stop() + env.storage_controller.start() + else: + env.storage_controller.configure_failpoints((self.failpoint, "off")) + + def expect_available(self): + # Controller panics _do_ leave pageservers available, but our test code relies + # on using the locate API to update configurations in Workload, so we must skip + # these actions when the controller has been panicked. 
+ return "panic" not in self.action + + def can_mitigate(self): + return False + + def fails_forward(self, env): + # Edge case: the very last failpoint that simulates a DB connection error, where + # the abort path will fail-forward and result in a complete split. + fail_forward = self.failpoint == "shard-split-post-complete" + + # If the failure was a panic, then if we expect split to eventually (after restart) + # complete, we must restart before checking that. + if fail_forward and "panic" in self.action: + log.info("Restarting storage controller after panic") + env.storage_controller.stop() + env.storage_controller.start() + + return fail_forward + + def expect_exception(self): + if "panic" in self.action: + return requests.exceptions.ConnectionError + else: + return StorageControllerApiException + + +class NodeKill(Failure): + def __init__(self, pageserver_id, mitigate): + self.pageserver_id = pageserver_id + self._mitigate = mitigate + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.stop(immediate=True) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.start() + + def expect_available(self): + return False + + def mitigate(self, env): + env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"}) + + +class CompositeFailure(Failure): + """ + Wrapper for failures in multiple components (e.g. 
a failpoint in the storage controller, *and* + stop a pageserver to interfere with rollback) + """ + + def __init__(self, failures: list[Failure]): + self.failures = failures + + self.pageserver_id = None + for f in failures: + if f.pageserver_id is not None: + self.pageserver_id = f.pageserver_id + break + + def apply(self, env: NeonEnv): + for f in self.failures: + f.apply(env) + + def clear(self, env): + for f in self.failures: + f.clear(env) + + def expect_available(self): + return all(f.expect_available() for f in self.failures) + + def mitigate(self, env): + for f in self.failures: + f.mitigate(env) + + def expect_exception(self): + expect = set(f.expect_exception() for f in self.failures) + + # We can't give a sensible response if our failures have different expectations + assert len(expect) == 1 + + return list(expect)[0] + + +@pytest.mark.parametrize( + "failure", + [ + PageserverFailpoint("api-500", 1, False), + NodeKill(1, False), + PageserverFailpoint("api-500", 1, True), + NodeKill(1, True), + PageserverFailpoint("shard-split-pre-prepare", 1, False), + PageserverFailpoint("shard-split-post-prepare", 1, False), + PageserverFailpoint("shard-split-pre-hardlink", 1, False), + PageserverFailpoint("shard-split-post-hardlink", 1, False), + PageserverFailpoint("shard-split-post-child-conf", 1, False), + PageserverFailpoint("shard-split-lsn-wait", 1, False), + PageserverFailpoint("shard-split-pre-finish", 1, False), + StorageControllerFailpoint("shard-split-validation", "return(1)"), + StorageControllerFailpoint("shard-split-post-begin", "return(1)"), + StorageControllerFailpoint("shard-split-post-remote", "return(1)"), + StorageControllerFailpoint("shard-split-post-complete", "return(1)"), + StorageControllerFailpoint("shard-split-validation", "panic(failpoint)"), + StorageControllerFailpoint("shard-split-post-begin", "panic(failpoint)"), + StorageControllerFailpoint("shard-split-post-remote", "panic(failpoint)"), + 
StorageControllerFailpoint("shard-split-post-complete", "panic(failpoint)"), + CompositeFailure( + [NodeKill(1, True), StorageControllerFailpoint("shard-split-post-begin", "return(1)")] + ), + CompositeFailure( + [NodeKill(1, False), StorageControllerFailpoint("shard-split-post-begin", "return(1)")] + ), + ], +) +def test_sharding_split_failures( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, + failure: Failure, +): + neon_env_builder.num_pageservers = 4 + neon_env_builder.control_plane_compute_hook_api = ( + compute_reconfigure_listener.control_plane_compute_hook_api + ) + initial_shard_count = 2 + split_shard_count = 4 + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + + # Create a tenant with secondary locations enabled + env.neon_cli.create_tenant( + tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}' + ) + + env.storage_controller.allowed_errors.extend( + [ + # All split failures log a warning when then enqueue the abort operation + ".*Enqueuing background abort.*", + # We exercise failure cases where abort itself will also fail (node offline) + ".*abort_tenant_shard_split.*", + ".*Failed to abort.*", + # Tolerate any error lots that mention a failpoint + ".*failpoint.*", + # Node offline cases will fail to send requests + ".*Reconcile error: receive body: error sending request for url.*", + # Node offline cases will fail inside reconciler when detaching secondaries + ".*Reconcile error on shard.*: receive body: error sending request for url.*", + # Node offline cases may eventually cancel reconcilers when the heartbeater realizes nodes are offline + ".*Reconcile error.*Cancelled.*", + # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning + ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", + ] + ) + + for ps in env.pageservers: 
+ # When we do node failures and abandon a shard, it will de-facto have old generation and + # thereby be unable to publish remote consistent LSN updates + ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") + + # If we're using a failure that will panic the storage controller, all background + # upcalls from the pageserver can fail + ps.allowed_errors.append(".*calling control plane generation validation API failed.*") + + # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything + assert ( + failure.pageserver_id is None + or len( + env.get_pageserver(failure.pageserver_id) + .http_client() + .tenant_list_locations()["tenant_shards"] + ) + > 0 + ) + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(100) + + # Put the environment into a failing state (exact meaning depends on `failure`) + failure.apply(env) + + with pytest.raises(failure.expect_exception()): + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + + # We expect that the overall operation will fail, but some split requests + # will have succeeded: the net result should be to return to a clean state, including + # detaching any child shards. 
+ def assert_rolled_back(exclude_ps_id=None) -> None: + secondary_count = 0 + attached_count = 0 + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") + assert tenant_shard_id.shard_count == initial_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + + if exclude_ps_id is not None: + # For a node failure case, we expect there to be a secondary location + # scheduled on the offline node, so expect one fewer secondary in total + assert secondary_count == initial_shard_count - 1 + else: + assert secondary_count == initial_shard_count + + assert attached_count == initial_shard_count + + def assert_split_done(exclude_ps_id=None) -> None: + secondary_count = 0 + attached_count = 0 + for ps in env.pageservers: + if exclude_ps_id is not None and ps.id == exclude_ps_id: + continue + + locations = ps.http_client().tenant_list_locations()["tenant_shards"] + for loc in locations: + tenant_shard_id = TenantShardId.parse(loc[0]) + log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}") + assert tenant_shard_id.shard_count == split_shard_count + if loc[1]["mode"] == "Secondary": + secondary_count += 1 + else: + attached_count += 1 + assert attached_count == split_shard_count + assert secondary_count == split_shard_count + + def finish_split(): + # Having failed+rolled back, we should be able to split again + # No failures this time; it will succeed + env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count) + env.storage_controller.reconcile_until_idle(timeout_secs=30) + + workload.churn_rows(10) + workload.validate() + + if failure.expect_available(): + # Even though the split failed partway through, this 
should not leave the tenant in + # an unavailable state. + # - Disable waiting for pageservers in the workload helper, because our + # failpoints may prevent API access. This only applies for failure modes that + # leave pageserver page_service API available. + # - This is a wait_until because clients may see transient errors in some split error cases, + # e.g. while waiting for a storage controller to re-attach a parent shard if we failed + # inside the pageserver and the storage controller responds by detaching children and attaching + # parents concurrently (https://github.com/neondatabase/neon/issues/7148) + wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) # type: ignore + + workload.validate() + + if failure.fails_forward(env): + log.info("Fail-forward failure, checking split eventually completes...") + # A failure type which results in eventual completion of the split + wait_until(30, 1, assert_split_done) + elif failure.can_mitigate(): + log.info("Mitigating failure...") + # Mitigation phase: we expect to be able to proceed with a successful shard split + failure.mitigate(env) + + # The split should appear to be rolled back from the point of view of all pageservers + # apart from the one that is offline + wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) + + finish_split() + wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) + + # Having cleared the failure, everything should converge to a pristine state + failure.clear(env) + wait_until(30, 1, assert_split_done) + else: + # Once we restore the faulty pageserver's API to good health, rollback should + # eventually complete. 
+ log.info("Clearing failure...") + failure.clear(env) + + wait_until(30, 1, assert_rolled_back) + + # Having rolled back, the tenant should be working + workload.churn_rows(10) + workload.validate() + + # Splitting again should work, since we cleared the failure + finish_split() + assert_split_done() + + if isinstance(failure, StorageControllerFailpoint) and "post-complete" in failure.failpoint: + # On a post-complete failure, the controller will recover the post-split state + # after restart, but it will have missed the optimization part of the split function + # where secondary downloads are kicked off. This means that reconcile_until_idle + # will take a very long time if we wait for all optimizations to complete, because + # those optimizations will wait for secondary downloads. + # + # Avoid that by configuring the tenant into Essential scheduling mode, so that it will + # skip optimizations when we're exercising this particular failpoint. + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + + # Having completed the split, pump the background reconciles to ensure that + # the scheduler reaches an idle state + env.storage_controller.reconcile_until_idle(timeout_secs=30) + + env.storage_controller.consistency_check() + + +def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): + """ + Check a scenario when one of the shards is much slower than others. + Without backpressure, this would lead to the slow shard falling behind + and eventually causing WAL timeouts. + """ + + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + + # 256KiB stripes: enable getting some meaningful data distribution without + # writing large quantities of data in this test. The stripe size is given + # in number of 8KiB pages. 
+ stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageservers = dict((int(p.id), p) for p in env.pageservers) + shards = env.storage_controller.locate(tenant_id) + + # Slow down one of the shards, around ~1MB/s + pageservers[4].http_client().configure_failpoints(("wal-ingest-record-sleep", "5%sleep(1)")) + + def shards_info(): + infos = [] + for shard in shards: + node_id = int(shard["node_id"]) + pageserver = pageservers[node_id] + shard_info = pageserver.http_client().timeline_detail(shard["shard_id"], timeline_id) + infos.append(shard_info) + last_record_lsn = shard_info["last_record_lsn"] + current_physical_size = shard_info["current_physical_size"] + log.info( + f"Shard on pageserver {node_id}: lsn={last_record_lsn}, size={current_physical_size}" + ) + return infos + + shards_info() + + workload = Workload( + env, + tenant_id, + timeline_id, + branch_name="main", + endpoint_opts={ + "config_lines": [ + # Tip: set to 100MB to make the test fail + "max_replication_write_lag=1MB", + ], + }, + ) + workload.init() + + endpoint = workload.endpoint() + + # on 2024-03-05, the default config on prod was [15MB, 10GB, null] + res = endpoint.safe_psql_many( + [ + "SHOW max_replication_write_lag", + "SHOW max_replication_flush_lag", + "SHOW max_replication_apply_lag", + ] + ) + log.info(f"backpressure config: {res}") + + last_flush_lsn = None + last_timestamp = None + + def update_write_lsn(): + nonlocal last_flush_lsn + nonlocal last_timestamp + + res = endpoint.safe_psql( + """ + SELECT + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag, + received_lsn, + pg_current_wal_flush_lsn() as flush_lsn, + neon.backpressure_throttling_time() as throttling_time + FROM neon.backpressure_lsns(); + """, + dbname="postgres", + )[0] + log.info( + f"received_lsn_lag = {res[0]}, 
received_lsn = {res[1]}, flush_lsn = {res[2]}, throttling_time = {res[3]}" + ) + + lsn = Lsn(res[2]) + now = time.time() + + if last_timestamp is not None: + delta = now - last_timestamp + delta_bytes = lsn - last_flush_lsn + avg_speed = delta_bytes / delta / 1024 / 1024 + log.info( + f"flush_lsn {lsn}, written {delta_bytes/1024}kb for {delta:.3f}s, avg_speed {avg_speed:.3f} MiB/s" + ) + + last_flush_lsn = lsn + last_timestamp = now + + update_write_lsn() + + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.write_rows(4096, upload=False) + workload.validate() + + update_write_lsn() + shards_info() + + for _write_iter in range(30): + # approximately 1MB of data + workload.write_rows(8000, upload=False) + update_write_lsn() + infos = shards_info() + min_lsn = min(Lsn(info["last_record_lsn"]) for info in infos) + max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos) + diff = max_lsn - min_lsn + assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure" + + +def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): + """ + Check that an unlogged relation is handled properly on a sharded tenant + + Reproducer for https://github.com/neondatabase/neon/issues/7451 + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8) + + # We will create many tables to ensure it's overwhelmingly likely that at least one + # of them doesn't land on shard 0 + table_names = [f"my_unlogged_{i}" for i in range(0, 16)] + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));") + ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 
'foo')") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(1, "foo")] + ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);") + + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as ep: + for table_name in table_names: + # Check that table works: we can select and insert + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [] + ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');") + result = ep.safe_psql(f"SELECT * from {table_name};") + assert result == [(2, "bar")] + + # Ensure that post-endpoint-restart modifications are ingested happily by pageserver + wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id) + + +def test_top_tenants(neon_env_builder: NeonEnvBuilder): + """ + The top_tenants API is used in shard auto-splitting to find candidates. + """ + + env = neon_env_builder.init_configs() + neon_env_builder.start() + + tenants = [] + n_tenants = 8 + for i in range(0, n_tenants): + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id) + + # Write a different amount of data to each tenant + w = Workload(env, tenant_id, timeline_id) + w.init() + w.write_rows(i * 1000) + w.stop() + + logical_size = env.pageserver.http_client().timeline_detail(tenant_id, timeline_id)[ + "current_logical_size" + ] + tenants.append((tenant_id, timeline_id, logical_size)) + + log.info(f"Created {tenant_id}/{timeline_id} with size {logical_size}") + + # Ask for 1 largest tenant + top_1 = env.pageserver.http_client().top_tenants("max_logical_size", 1, 8, 0) + assert len(top_1["shards"]) == 1 + assert top_1["shards"][0]["id"] == str(tenants[-1][0]) + assert top_1["shards"][0]["max_logical_size"] == tenants[-1][2] + + # Apply a lower bound limit + top = env.pageserver.http_client().top_tenants( + "max_logical_size", 100, 8, where_gt=tenants[3][2] + ) + assert len(top["shards"]) 
== n_tenants - 4 + assert set(i["id"] for i in top["shards"]) == set(str(i[0]) for i in tenants[4:]) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py new file mode 100644 index 0000000000..30f96ceee8 --- /dev/null +++ b/test_runner/regress/test_storage_controller.py @@ -0,0 +1,1621 @@ +import json +import threading +import time +from collections import defaultdict +from datetime import datetime, timezone +from typing import Any, Dict, List, Union + +import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.compute_reconfigure import ComputeReconfigure +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PgBin, + StorageControllerApiException, + TokenScope, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import ( + MANY_SMALL_LAYERS_TENANT_CONFIG, + assert_prefix_empty, + assert_prefix_not_empty, + enable_remote_storage_versioning, + list_prefix, + remote_storage_delete_key, + tenant_delete_wait_completed, + timeline_delete_wait_completed, +) +from fixtures.pg_version import PgVersion +from fixtures.remote_storage import RemoteStorageKind, s3_storage +from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until +from fixtures.workload import Workload +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +def get_node_shard_counts(env: NeonEnv, tenant_ids): + counts: defaultdict[int, int] = defaultdict(int) + for tid in tenant_ids: + for shard in env.storage_controller.locate(tid): + counts[shard["node_id"]] += 1 + return counts + + +def test_storage_controller_smoke( + neon_env_builder: NeonEnvBuilder, +): + """ + Test the basic lifecycle of a storage controller: + - Restarting + - Restarting a pageserver + 
- Creating and deleting tenants and timelines + - Marking a pageserver offline + """ + + neon_env_builder.num_pageservers = 3 + env = neon_env_builder.init_configs() + + for pageserver in env.pageservers: + # This test detaches tenants during migration, which can race with deletion queue operations, + # during detach we only do an advisory flush, we don't wait for it. + pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"]) + + # Start services by hand so that we can skip a pageserver (this will start + register later) + env.broker.try_start() + env.storage_controller.start() + env.pageservers[0].start() + env.pageservers[1].start() + for sk in env.safekeepers: + sk.start() + + # The pageservers we started should have registered with the sharding service on startup + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + assert set(n["id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id} + + # Starting an additional pageserver should register successfully + env.pageservers[2].start() + nodes = env.storage_controller.node_list() + assert len(nodes) == 3 + assert set(n["id"] for n in nodes) == {ps.id for ps in env.pageservers} + + # Use a multiple of pageservers to get nice even number of shards on each one + tenant_shard_count = len(env.pageservers) * 4 + tenant_count = len(env.pageservers) * 2 + shards_per_tenant = tenant_shard_count // tenant_count + tenant_ids = set(TenantId.generate() for i in range(0, tenant_count)) + + # Creating several tenants should spread out across the pageservers + for tid in tenant_ids: + env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + + # Repeating a creation should be idempotent (we are just testing it doesn't return an error) + env.storage_controller.tenant_create( + tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant + ) + + for node_id, count in get_node_shard_counts(env, tenant_ids).items(): + # we used a multiple of pagservers for the total 
shard count, + # so expect equal number on all pageservers + assert count == tenant_shard_count / len( + env.pageservers + ), f"Node {node_id} has bad count {count}" + + # Creating and deleting timelines should work, using identical API to pageserver + timeline_crud_tenant = next(iter(tenant_ids)) + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id + ) + timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant) + assert len(timelines) == 2 + assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines) + # virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id) + timeline_delete_wait_completed( + env.storage_controller.pageserver_api(), timeline_crud_tenant, timeline_id + ) + timelines = env.storage_controller.pageserver_api().timeline_list(timeline_crud_tenant) + assert len(timelines) == 1 + assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines) + + # Marking a pageserver offline should migrate tenants away from it. + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + def node_evacuated(node_id: int) -> None: + counts = get_node_shard_counts(env, tenant_ids) + assert counts[node_id] == 0 + + wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + + # Let all the reconciliations after marking the node offline complete + env.storage_controller.reconcile_until_idle() + + # Marking pageserver active should not migrate anything to it + # immediately + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Active"}) + time.sleep(1) + assert get_node_shard_counts(env, tenant_ids)[env.pageservers[0].id] == 0 + + # Restarting a pageserver should not detach any tenants (i.e. 
/re-attach works) + before_restart = env.pageservers[1].http_client().tenant_list_locations() + env.pageservers[1].stop() + env.pageservers[1].start() + after_restart = env.pageservers[1].http_client().tenant_list_locations() + assert len(after_restart) == len(before_restart) + + # Locations should be the same before & after restart, apart from generations + for _shard_id, tenant in after_restart["tenant_shards"]: + del tenant["generation"] + for _shard_id, tenant in before_restart["tenant_shards"]: + del tenant["generation"] + assert before_restart == after_restart + + # Delete all the tenants + for tid in tenant_ids: + tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10) + + env.storage_controller.consistency_check() + + # Set a scheduling policy on one node, create all the tenants, observe + # that the scheduling policy is respected. + env.storage_controller.node_configure(env.pageservers[1].id, {"scheduling": "Draining"}) + + # Create some fresh tenants + tenant_ids = set(TenantId.generate() for i in range(0, tenant_count)) + for tid in tenant_ids: + env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + + counts = get_node_shard_counts(env, tenant_ids) + # Nothing should have been scheduled on the node in Draining + assert counts[env.pageservers[1].id] == 0 + assert counts[env.pageservers[0].id] == tenant_shard_count // 2 + assert counts[env.pageservers[2].id] == tenant_shard_count // 2 + + env.storage_controller.consistency_check() + + +def test_node_status_after_restart( + neon_env_builder: NeonEnvBuilder, +): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # Initially we have two online pageservers + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + env.pageservers[1].stop() + env.storage_controller.allowed_errors.extend([".*Could not scan node"]) + + env.storage_controller.stop() + env.storage_controller.start() + + def is_ready(): + assert 
env.storage_controller.ready() is True + + wait_until(30, 1, is_ready) + + # We loaded nodes from database on restart + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + # We should still be able to create a tenant, because the pageserver which is still online + # should have had its availability state set to Active. + env.storage_controller.tenant_create(TenantId.generate()) + + env.storage_controller.consistency_check() + + +def test_storage_controller_passthrough( + neon_env_builder: NeonEnvBuilder, +): + """ + For simple timeline/tenant GET APIs that don't require coordination across + shards, the sharding service implements a proxy to shard zero. This test + calls those APIs. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + # We will talk to storage controller as if it was a pageserver, using the pageserver + # HTTP client + client = PageserverHttpClient(env.storage_controller_port, lambda: True) + timelines = client.timeline_list(tenant_id=env.initial_tenant) + assert len(timelines) == 1 + + status = client.tenant_status(env.initial_tenant) + assert TenantId(status["id"]) == env.initial_tenant + assert set(TimelineId(t) for t in status["timelines"]) == { + env.initial_timeline, + } + assert status["state"]["slug"] == "Active" + + (synthetic_size, size_inputs) = client.tenant_size_and_modelinputs(env.initial_tenant) + assert synthetic_size > 0 + assert "segments" in size_inputs + + env.storage_controller.consistency_check() + + +def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + tenant_a = env.initial_tenant + tenant_b = TenantId.generate() + env.storage_controller.tenant_create(tenant_b) + env.pageserver.tenant_detach(tenant_a) + + # TODO: extend this test to use multiple pageservers, and check that locations don't move around + # on restart. 
+ + # Storage controller restart + env.storage_controller.stop() + env.storage_controller.start() + + observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list()) + + # Tenant A should remain detached + assert tenant_a not in observed + + # Tenant B should still be attached + assert tenant_b in observed + + # Pageserver restart + env.pageserver.stop() + env.pageserver.start() + + # Same assertions as above: restarting either service should not perturb things + observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list()) + assert tenant_a not in observed + assert tenant_b in observed + + env.storage_controller.consistency_check() + + +@pytest.mark.parametrize("warm_up", [True, False]) +def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): + """ + We onboard tenants to the sharding service by treating it as a 'virtual pageserver' + which provides the /location_config API. This is similar to creating a tenant, + but imports the generation number. + """ + + # One pageserver to simulate legacy environment, two to be managed by storage controller + neon_env_builder.num_pageservers = 3 + + # Start services by hand so that we can skip registration on one of the pageservers + env = neon_env_builder.init_configs() + env.broker.try_start() + env.storage_controller.start() + + # This is the pageserver where we'll initially create the tenant. Run it in emergency + # mode so that it doesn't talk to storage controller, and do not register it. 
+ env.pageservers[0].allowed_errors.append(".*Emergency mode!.*") + env.pageservers[0].patch_config_toml_nonrecursive( + { + "control_plane_emergency_mode": True, + } + ) + env.pageservers[0].start() + origin_ps = env.pageservers[0] + + # These are the pageservers managed by the sharding service, where the tenant + # will be attached after onboarding + env.pageservers[1].start() + env.pageservers[2].start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + for sk in env.safekeepers: + sk.start() + + # Create a tenant directly via pageserver HTTP API, skipping the storage controller + tenant_id = TenantId.generate() + generation = 123 + origin_ps.http_client().tenant_create(tenant_id, generation=generation) + + # As if doing a live migration, first configure origin into stale mode + r = origin_ps.http_client().tenant_location_conf( + tenant_id, + { + "mode": "AttachedStale", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + assert len(r["shards"]) == 1 + + if warm_up: + origin_ps.http_client().tenant_heatmap_upload(tenant_id) + + # We expect to be called via live migration code, which may try to configure the tenant into secondary + # mode before attaching it. 
+ virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + "generation": None, + }, + ) + + virtual_ps_http.tenant_secondary_download(tenant_id) + warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][ + "node_secondary" + ][0] + + # Call into storage controller to onboard the tenant + generation += 1 + r = virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedMulti", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + assert len(r["shards"]) == 1 + + describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0] + dest_ps_id = describe["node_attached"] + dest_ps = env.get_pageserver(dest_ps_id) + if warm_up: + # The storage controller should have attached the tenant to the same place + # it had a secondary location, otherwise there was no point warming it up + assert dest_ps_id == warm_up_ps + + # It should have been given a new secondary location as well + assert len(describe["node_secondary"]) == 1 + assert describe["node_secondary"][0] != warm_up_ps + + # As if doing a live migration, detach the original pageserver + origin_ps.http_client().tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + # As if doing a live migration, call into the storage controller to + # set it to AttachedSingle: this is a no-op, but we test it because the + # cloud control plane may call this for symmetry with live migration to + # an individual pageserver + r = virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + assert len(r["shards"]) == 1 + + # We should see the tenant is now attached to the pageserver managed + # by the sharding service + origin_tenants = origin_ps.http_client().tenant_list() + assert 
len(origin_tenants) == 0 + dest_tenants = dest_ps.http_client().tenant_list() + assert len(dest_tenants) == 1 + assert TenantId(dest_tenants[0]["id"]) == tenant_id + + # sharding service advances generation by 1 when it first attaches. We started + # with a nonzero generation so this equality also proves that the generation + # was properly carried over during onboarding. + assert dest_tenants[0]["generation"] == generation + 1 + + # The onboarded tenant should survive a restart of sharding service + env.storage_controller.stop() + env.storage_controller.start() + + # The onboarded tenant should survive a restart of pageserver + dest_ps.stop() + dest_ps.start() + + # Having onboarded via /location_config, we should also be able to update the + # TenantConf part of LocationConf, without inadvertently resetting the generation + modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100} + dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id) + + # The generation has moved on since we onboarded + assert generation != dest_tenant_before_conf_change["generation"] + + r = virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": modified_tenant_conf, + # This is intentionally a stale generation + "generation": generation, + }, + ) + assert len(r["shards"]) == 1 + dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id) + assert ( + dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] + ) + dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id) + + # Storage controller auto-sets heatmap period, ignore it for the comparison + del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"] + assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf + + env.storage_controller.consistency_check() + + +def test_storage_controller_compute_hook( + httpserver: HTTPServer, + 
neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + """ + Test that the sharding service calls out to the configured HTTP endpoint on attachment changes + """ + + # We will run two pageservers to migrate and check that the storage controller sends notifications + # when migrating. + neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + # Set up fake HTTP notify endpoint + notifications = [] + + handle_params = {"status": 200} + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_start() + + # We will do an unclean migration, which will result in deletion queue warnings + env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*") + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + } + assert notifications[0] == expect + + env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Offline"}) + + def node_evacuated(node_id: int) -> None: + counts = get_node_shard_counts(env, [env.initial_tenant]) + assert counts[node_id] == 0 + + wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + + # Additional notification from migration + log.info(f"notifications: {notifications}") + expect = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[1].id), "shard_number": 0}], + } + + def received_migration_notification(): + assert 
len(notifications) == 2 + assert notifications[1] == expect + + wait_until(20, 0.25, received_migration_notification) + + # When we restart, we should re-emit notifications for all tenants + env.storage_controller.stop() + env.storage_controller.start() + + def received_restart_notification(): + assert len(notifications) == 3 + assert notifications[2] == expect + + wait_until(10, 1, received_restart_notification) + + # Splitting a tenant should cause its stripe size to become visible in the compute notification + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) + expect = { + "tenant_id": str(env.initial_tenant), + "stripe_size": 32768, + "shards": [ + {"node_id": int(env.pageservers[1].id), "shard_number": 0}, + {"node_id": int(env.pageservers[1].id), "shard_number": 1}, + ], + } + + def received_split_notification(): + assert len(notifications) == 4 + assert notifications[3] == expect + + wait_until(10, 1, received_split_notification) + + # If the compute hook is unavailable, that should not block creating a tenant and + # creating a timeline. 
This simulates a control plane refusing to accept notifications + handle_params["status"] = 423 + degraded_tenant_id = TenantId.generate() + degraded_timeline_id = TimelineId.generate() + env.storage_controller.tenant_create(degraded_tenant_id) + env.storage_controller.pageserver_api().timeline_create( + PgVersion.NOT_SET, degraded_tenant_id, degraded_timeline_id + ) + + # Ensure we hit the handler error path + env.storage_controller.allowed_errors.append( + ".*Failed to notify compute of attached pageserver.*tenant busy.*" + ) + env.storage_controller.allowed_errors.append(".*Reconcile error.*tenant busy.*") + assert notifications[-1] is not None + assert notifications[-1]["tenant_id"] == str(degraded_tenant_id) + + env.storage_controller.consistency_check() + + +def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): + """ + Verify that occasional-use debug APIs work as expected. This is a lightweight test + that just hits the endpoints to check that they don't bitrot. + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id, shard_count=2, shard_stripe_size=8192) + + # Check that the consistency check passes on a freshly setup system + env.storage_controller.consistency_check() + + # These APIs are intentionally not implemented as methods on NeonStorageController, as + # they're just for use in unanticipated circumstances. 
+ + # Initial tenant (1 shard) and the one we just created (2 shards) should be visible + response = env.storage_controller.request( + "GET", + f"{env.storage_controller_api}/debug/v1/tenant", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + assert len(response.json()) == 3 + + # Scheduler should report the expected nodes and shard counts + response = env.storage_controller.request( + "GET", f"{env.storage_controller_api}/debug/v1/scheduler" + ) + # Two nodes, in a dict of node_id->node + assert len(response.json()["nodes"]) == 2 + assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 + assert all(v["may_schedule"] for v in response.json()["nodes"].values()) + + response = env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + assert len(env.storage_controller.node_list()) == 1 + + response = env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Tenant drop should be reflected in dump output + response = env.storage_controller.request( + "GET", + f"{env.storage_controller_api}/debug/v1/tenant", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + assert len(response.json()) == 1 + + # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're + # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind. 
+ env.storage_controller.consistency_check() + + +def test_storage_controller_s3_time_travel_recovery( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + """ + Test for S3 time travel + """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + # Mock S3 doesn't have versioning enabled by default, enable it + # (also do it before there is any writes to the bucket) + if remote_storage_kind == RemoteStorageKind.MOCK_S3: + remote_storage = neon_env_builder.pageserver_remote_storage + assert remote_storage, "remote storage not configured" + enable_remote_storage_versioning(remote_storage) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=2, + shard_stripe_size=8192, + tenant_config=MANY_SMALL_LAYERS_TENANT_CONFIG, + ) + + # Check that the consistency check passes + env.storage_controller.consistency_check() + + branch_name = "main" + timeline_id = env.neon_cli.create_timeline( + branch_name, + tenant_id=tenant_id, + ) + # Write some nontrivial amount of data into the endpoint and wait until it is uploaded + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + # last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) + + # Give the data time to be uploaded + time.sleep(4) + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + time.sleep(4) + ts_before_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + # Simulate a "disaster": delete some random files from remote storage for one of the shards + 
assert env.pageserver_remote_storage + shard_id_for_list = "0002" + objects: List[ObjectTypeDef] = list_prefix( + env.pageserver_remote_storage, + f"tenants/{tenant_id}-{shard_id_for_list}/timelines/{timeline_id}/", + ).get("Contents", []) + assert len(objects) > 1 + log.info(f"Found {len(objects)} objects in remote storage") + should_delete = False + for obj in objects: + obj_key = obj["Key"] + should_delete = not should_delete + if not should_delete: + log.info(f"Keeping key on remote storage: {obj_key}") + continue + log.info(f"Deleting key from remote storage: {obj_key}") + remote_storage_delete_key(env.pageserver_remote_storage, obj_key) + pass + + time.sleep(4) + ts_after_disaster = datetime.now(tz=timezone.utc).replace(tzinfo=None) + time.sleep(4) + + # Do time travel recovery + virtual_ps_http.tenant_time_travel_remote_storage( + tenant_id, ts_before_disaster, ts_after_disaster, shard_counts=[2] + ) + time.sleep(4) + + # Attach the tenant again + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": 100, + }, + ) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + endpoint.safe_psql("SELECT * FROM created_foo;") + + env.storage_controller.consistency_check() + + +def test_storage_controller_auth(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + env = neon_env_builder.init_start() + svc = env.storage_controller + api = env.storage_controller_api + + tenant_id = TenantId.generate() + body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)} + + env.storage_controller.allowed_errors.append(".*Unauthorized.*") + env.storage_controller.allowed_errors.append(".*Forbidden.*") + + # No token + with pytest.raises( + StorageControllerApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("POST", f"{env.storage_controller_api}/v1/tenant", json=body) + + # Token with incorrect scope + with 
pytest.raises( + StorageControllerApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA) + ) + + # Token with correct scope + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API) + ) + + # Token with admin scope should also be permitted + svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + + # No token + with pytest.raises( + StorageControllerApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("GET", f"{api}/debug/v1/tenant") + + # Token with incorrect scope + with pytest.raises( + StorageControllerApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "GET", f"{api}/debug/v1/tenant", headers=svc.headers(TokenScope.GENERATIONS_API) + ) + + # No token + with pytest.raises( + StorageControllerApiException, + match="Unauthorized: missing authorization header", + ): + svc.request("POST", f"{api}/upcall/v1/re-attach") + + # Token with incorrect scope + with pytest.raises( + StorageControllerApiException, + match="Forbidden: JWT authentication error", + ): + svc.request( + "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API) + ) + + +def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder): + """ + Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without + supplying the whole LocationConf. 
+ """ + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + + http = env.storage_controller.pageserver_api() + + default_value = "7days" + new_value = "1h" + http.set_tenant_config(tenant_id, {"pitr_interval": new_value}) + + # Ensure the change landed on the storage controller + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] == new_value + assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value + + # Ensure the change made it down to the pageserver + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == new_value + assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value + + # Omitting a value clears it. This looks different in storage controller + # vs. pageserver API calls, because pageserver has defaults. + http.set_tenant_config(tenant_id, {}) + readback_controller = http.tenant_config(tenant_id) + assert readback_controller.effective_config["pitr_interval"] is None + assert readback_controller.tenant_specific_overrides["pitr_interval"] is None + readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id) + assert readback_ps.effective_config["pitr_interval"] == default_value + assert "pitr_interval" not in readback_ps.tenant_specific_overrides + + env.storage_controller.consistency_check() + + +def test_storage_controller_tenant_deletion( + neon_env_builder: NeonEnvBuilder, + compute_reconfigure_listener: ComputeReconfigure, +): + """ + Validate that: + - Deleting a tenant deletes all its shards + - Deletion does not require the compute notification hook to be responsive + - Deleting a tenant also removes all secondary locations + """ + neon_env_builder.num_pageservers = 4 + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.control_plane_compute_hook_api = ( + 
compute_reconfigure_listener.control_plane_compute_hook_api + ) + + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant( + tenant_id, timeline_id, shard_count=2, placement_policy='{"Attached":1}' + ) + + # Ensure all the locations are configured, including secondaries + env.storage_controller.reconcile_until_idle() + + shard_ids = [ + TenantShardId.parse(shard["shard_id"]) for shard in env.storage_controller.locate(tenant_id) + ] + + # Assert attachments all have local content + for shard_id in shard_ids: + pageserver = env.get_tenant_pageserver(shard_id) + assert pageserver.tenant_dir(shard_id).exists() + + # Assert all shards have some content in remote storage + for shard_id in shard_ids: + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(shard_id), + ) + ), + ) + + # Break the compute hook: we are checking that deletion does not depend on the compute hook being available + def break_hook(): + raise RuntimeError("Unexpected call to compute hook") + + compute_reconfigure_listener.register_on_notify(break_hook) + + # No retry loop: deletion should complete in one shot without polling for 202 responses, because + # it cleanly detaches all the shards first, and then deletes them in remote storage + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + + # Assert no pageservers have any local content + for pageserver in env.pageservers: + for shard_id in shard_ids: + assert not pageserver.tenant_dir(shard_id).exists() + + for shard_id in shard_ids: + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(shard_id), + ) + ), + ) + + # Assert the tenant is not visible in storage controller API + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_describe(tenant_id) + + +class Failure: + pageserver_id: int + 
+ def apply(self, env: NeonEnv): + raise NotImplementedError() + + def clear(self, env: NeonEnv): + raise NotImplementedError() + + def nodes(self): + raise NotImplementedError() + + +class NodeStop(Failure): + def __init__(self, pageserver_ids, immediate): + self.pageserver_ids = pageserver_ids + self.immediate = immediate + + def apply(self, env: NeonEnv): + for ps_id in self.pageserver_ids: + pageserver = env.get_pageserver(ps_id) + pageserver.stop(immediate=self.immediate) + + def clear(self, env: NeonEnv): + for ps_id in self.pageserver_ids: + pageserver = env.get_pageserver(ps_id) + pageserver.start() + + def nodes(self): + return self.pageserver_ids + + +class PageserverFailpoint(Failure): + def __init__(self, failpoint, pageserver_id): + self.failpoint = failpoint + self.pageserver_id = pageserver_id + + def apply(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "return(1)")) + + def clear(self, env: NeonEnv): + pageserver = env.get_pageserver(self.pageserver_id) + pageserver.http_client().configure_failpoints((self.failpoint, "off")) + + def nodes(self): + return [self.pageserver_id] + + +def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: + tenants = env.storage_controller.tenant_list() + + node_to_tenants: dict[int, list[TenantId]] = {} + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "AttachedSingle" + ): + crnt = node_to_tenants.get(int(node_id), []) + crnt.append(TenantId(t["tenant_shard_id"])) + node_to_tenants[int(node_id)] = crnt + + return node_to_tenants + + +@pytest.mark.parametrize( + "failure", + [ + NodeStop(pageserver_ids=[1], immediate=False), + NodeStop(pageserver_ids=[1], immediate=True), + NodeStop(pageserver_ids=[1, 2], immediate=True), + 
PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), + ], +) +def test_storage_controller_heartbeats( + neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure +): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # Default log allow list permits connection errors, but this test will use error responses on + # the utilization endpoint. + env.storage_controller.allowed_errors.append( + ".*Call to node.*management API.*failed.*failpoint.*" + ) + + # Initially we have two online pageservers + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + assert all([n["availability"] == "Active" for n in nodes]) + + # ... then we create two tenants and write some data into them + def create_tenant(tid: TenantId): + env.storage_controller.tenant_create(tid) + + branch_name = "main" + env.neon_cli.create_timeline( + branch_name, + tenant_id=tid, + ) + + with env.endpoints.create_start("main", tenant_id=tid) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + endpoint.safe_psql("CREATE TABLE created_foo(id integer);") + + tenant_ids = [TenantId.generate(), TenantId.generate()] + for tid in tenant_ids: + create_tenant(tid) + + # ... expecting that each tenant will be placed on a different node + def tenants_placed(): + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + + # Check that all the tenants have been attached + assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids) + # Check that each node got one tenant + assert all((len(ts) == 1 for ts in node_to_tenants.values())) + + wait_until(10, 1, tenants_placed) + + # ... 
then we apply the failure + offline_node_ids = set(failure.nodes()) + online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids + + for node_id in offline_node_ids: + env.get_pageserver(node_id).allowed_errors.append( + # In the case of the failpoint failure, the impacted pageserver + # still believes it has the tenant attached since location + # config calls into it will fail due to being marked offline. + ".*Dropped remote consistent LSN updates.*", + ) + + if len(offline_node_ids) > 1: + env.get_pageserver(node_id).allowed_errors.append( + ".*Scheduling error when marking pageserver.*offline.*", + ) + + failure.apply(env) + + # ... expecting the heartbeats to mark it offline + def nodes_offline(): + nodes = env.storage_controller.node_list() + log.info(f"{nodes=}") + for node in nodes: + if node["id"] in offline_node_ids: + assert node["availability"] == "Offline" + + # A node is considered offline if the last successful heartbeat + # was more than 10 seconds ago (hardcoded in the storage controller). + wait_until(20, 1, nodes_offline) + + # .. expecting the tenant on the offline node to be migrated + def tenant_migrated(): + if len(online_node_ids) == 0: + time.sleep(5) + return + + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"{node_to_tenants=}") + + observed_tenants = set() + for node_id in online_node_ids: + observed_tenants |= set(node_to_tenants[node_id]) + + assert observed_tenants == set(tenant_ids) + + wait_until(10, 1, tenant_migrated) + + # ... then we clear the failure + failure.clear(env) + + # ... expecting the offline node to become active again + def nodes_online(): + nodes = env.storage_controller.node_list() + for node in nodes: + if node["id"] in online_node_ids: + assert node["availability"] == "Active" + + wait_until(10, 1, nodes_online) + + time.sleep(5) + + node_to_tenants = build_node_to_tenants_map(env) + log.info(f"Back online: {node_to_tenants=}") + + # ... 
expecting the storage controller to reach a consistent state + def storage_controller_consistent(): + env.storage_controller.consistency_check() + + wait_until(30, 1, storage_controller_consistent) + + +def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): + """ + Exercise the behavior of the /re-attach endpoint on pageserver startup when + pageservers have a mixture of attached and secondary locations + """ + + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + # We'll have two tenants. + tenant_a = TenantId.generate() + env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}') + tenant_b = TenantId.generate() + env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}') + + # Each pageserver will have one attached and one secondary location + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_a, 0, 0), env.pageservers[0].id + ) + env.storage_controller.tenant_shard_migrate( + TenantShardId(tenant_b, 0, 0), env.pageservers[1].id + ) + + # Hard-fail a pageserver + victim_ps = env.pageservers[1] + survivor_ps = env.pageservers[0] + victim_ps.stop(immediate=True) + + # Heatbeater will notice it's offline, and consequently attachments move to the other pageserver + def failed_over(): + locations = survivor_ps.http_client().tenant_list_locations()["tenant_shards"] + log.info(f"locations: {locations}") + assert len(locations) == 2 + assert all(loc[1]["mode"] == "AttachedSingle" for loc in locations) + + # We could pre-empty this by configuring the node to Offline, but it's preferable to test + # the realistic path we would take when a node restarts uncleanly. 
+ # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local + wait_until(30, 1, failed_over) + + reconciles_before_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + + # Restart the failed pageserver + victim_ps.start() + + # We expect that the re-attach call correctly tipped off the pageserver that its locations + # are all secondaries now. + locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(locations) == 2 + assert all(loc[1]["mode"] == "Secondary" for loc in locations) + + # We expect that this situation resulted from the re_attach call, and not any explicit + # Reconciler runs: assert that the reconciliation count has not gone up since we restarted. + reconciles_after_restart = env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + assert reconciles_after_restart == reconciles_before_restart + + +def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder): + """ + Check that emergency hooks for disabling rogue tenants' reconcilers work as expected. 
+ """ + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + + env.storage_controller.allowed_errors.extend( + [ + # We will intentionally cause reconcile errors + ".*Reconcile error.*", + # Message from using a scheduling policy + ".*Scheduling is disabled by policy.*", + ".*Skipping reconcile for policy.*", + # Message from a node being offline + ".*Call to node .* management API .* failed", + ] + ) + + # Stop pageserver so that reconcile cannot complete + env.pageserver.stop() + + env.storage_controller.tenant_create(tenant_id, placement_policy="Detached") + + # Try attaching it: we should see reconciles failing + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": {"Attached": 0}, + }, + ) + + def reconcile_errors() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "error"} + ) + or 0 + ) + + def reconcile_ok() -> int: + return int( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_complete_total", filter={"status": "ok"} + ) + or 0 + ) + + def assert_errors_gt(n) -> int: + e = reconcile_errors() + assert e > n + return e + + errs = wait_until(10, 1, lambda: assert_errors_gt(0)) + + # Try reconciling again, it should fail again + with pytest.raises(StorageControllerApiException): + env.storage_controller.reconcile_all() + errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) + + # Configure the tenant to disable reconciles + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + + # Try reconciling again, it should not cause an error (silently skip) + env.storage_controller.reconcile_all() + assert reconcile_errors() == errs + + # Start the pageserver and re-enable reconciles + env.pageserver.start() + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Active", + }, + ) + + def assert_ok_gt(n) -> int: + o = reconcile_ok() + 
assert o > n + return o + + # We should see a successful reconciliation + wait_until(10, 1, lambda: assert_ok_gt(0)) + + # And indeed the tenant should be attached + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + + +def test_storcon_cli(neon_env_builder: NeonEnvBuilder): + """ + The storage controller command line interface (storcon-cli) is an internal tool. Most tests + just use the APIs directly: this test exercises some basics of the CLI as a regression test + that the client remains usable as the server evolves. + """ + output_dir = neon_env_builder.test_output_dir + shard_count = 4 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] + + def storcon_cli(args): + """ + CLI wrapper: returns stdout split into a list of non-empty strings + """ + (output_path, stdout, status_code) = subprocess_capture( + output_dir, + [str(s) for s in base_args + args], + echo_stderr=True, + echo_stdout=True, + env={}, + check=False, + capture_stdout=True, + timeout=10, + ) + if status_code: + log.warning(f"Command {args} failed") + log.warning(f"Output at: {output_path}") + + raise RuntimeError("CLI failure (check logs for stderr)") + + assert stdout is not None + return [line.strip() for line in stdout.split("\n") if line.strip()] + + # List nodes + node_lines = storcon_cli(["nodes"]) + # Table header, footer, and one line of data + assert len(node_lines) == 5 + assert "localhost" in node_lines[3] + + # Pause scheduling onto a node + storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"]) + assert "Pause" in storcon_cli(["nodes"])[3] + + # We will simulate a node death and then marking it offline + env.pageservers[0].stop(immediate=True) + # Sleep to make it unlikely that the controller's heartbeater will race handling + # a /utilization response internally, such that it marks the node back online. 
IRL + # there would always be a longer delay than this before a node failing and a human + # intervening. + time.sleep(2) + + storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) + assert "Offline" in storcon_cli(["nodes"])[3] + + # List tenants + tenant_lines = storcon_cli(["tenants"]) + assert len(tenant_lines) == 5 + assert str(env.initial_tenant) in tenant_lines[3] + + # Setting scheduling policies intentionally result in warnings, they're for rare use. + env.storage_controller.allowed_errors.extend( + [".*Skipping reconcile for policy.*", ".*Scheduling is disabled by policy.*"] + ) + + # Describe a tenant + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + assert len(tenant_lines) == 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[3] + + # Pause changes on a tenant + storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + assert "Stop" in storcon_cli(["tenants"])[3] + + # Change a tenant's placement + storcon_cli( + ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] + ) + assert "Secondary" in storcon_cli(["tenants"])[3] + + # Modify a tenant's config + storcon_cli( + [ + "tenant-config", + "--tenant-id", + str(env.initial_tenant), + "--config", + json.dumps({"pitr_interval": "1m"}), + ] + ) + + # Quiesce any background reconciliation before doing consistency check + env.storage_controller.reconcile_until_idle(timeout_secs=10) + env.storage_controller.consistency_check() + + +def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder): + """ + Check that when lock on resource (tenants, nodes) is held for too long it is + traced in logs. 
+ """ + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + env.storage_controller.allowed_errors.extend( + [ + ".*Lock on.*", + ".*Scheduling is disabled by policy.*", + f".*Operation TimelineCreate on key {tenant_id} has waited.*", + ] + ) + + # Apply failpoint + env.storage_controller.configure_failpoints( + ("tenant-update-policy-exclusive-lock", "return(35000)") + ) + + # This will hold the exclusive for enough time to cause an warning + def update_tenent_policy(): + env.storage_controller.tenant_policy_update( + tenant_id=tenant_id, + body={ + "scheduling": "Stop", + }, + ) + + thread_update_tenant_policy = threading.Thread(target=update_tenent_policy) + thread_update_tenant_policy.start() + + # Make sure the update policy thread has started + time.sleep(1) + # This will not be able to access and will log a warning + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + thread_update_tenant_policy.join() + + env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for") + env.storage_controller.assert_log_contains( + f"Operation TimelineCreate on key {tenant_id} has waited" + ) + + +@pytest.mark.parametrize("remote_storage", [RemoteStorageKind.LOCAL_FS, s3_storage()]) +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_storage): + """ + Tenant import is a support/debug tool for recovering a tenant from remote storage + if we don't have any metadata for it in the storage controller. + """ + + # This test is parametrized on remote storage because it exercises the relatively rare + # code path of listing with a prefix that is not a directory name: this helps us notice + # quickly if local_fs or s3_bucket implementations diverge. 
+ neon_env_builder.enable_pageserver_remote_storage(remote_storage) + + # Use multiple pageservers because some test helpers assume single sharded tenants + # if there is only one pageserver. + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + tenant_id = env.initial_tenant + + # Create a second timeline to ensure that import finds both + timeline_a = env.initial_timeline + timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + + workload_a = Workload(env, tenant_id, timeline_a, branch_name="main") + workload_a.init() + + workload_b = Workload(env, tenant_id, timeline_b, branch_name="branch_b") + workload_b.init() + + # Write some data + workload_a.write_rows(72) + expect_rows_a = workload_a.expect_rows + workload_a.stop() + del workload_a + + # Bump generation to make sure generation recovery works properly + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + # Write some data in the higher generation into the other branch + workload_b.write_rows(107) + expect_rows_b = workload_b.expect_rows + workload_b.stop() + del workload_b + + # Detach from pageservers + env.storage_controller.tenant_policy_update( + tenant_id, + { + "placement": "Detached", + }, + ) + env.storage_controller.reconcile_until_idle(timeout_secs=10) + + # Force-drop it from the storage controller + env.storage_controller.request( + "POST", + f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Now import it again + env.neon_cli.import_tenant(tenant_id) + + # Check we found the shards + describe = env.storage_controller.tenant_describe(tenant_id) + literal_shard_count = 1 if shard_count is None else shard_count + assert len(describe["shards"]) == literal_shard_count + + # Check the data is still there: this implicitly proves that we recovered generation numbers + # properly, for the timeline which was 
written to after a generation bump. + for timeline, branch, expect_rows in [ + (timeline_a, "main", expect_rows_a), + (timeline_b, "branch_1", expect_rows_b), + ]: + workload = Workload(env, tenant_id, timeline, branch_name=branch) + workload.expect_rows = expect_rows + workload.validate() + + +def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): + """ + Graceful reststart of storage controller clusters use the drain and + fill hooks in order to migrate attachments away from pageservers before + restarting. In practice, Ansible will drive this process. + """ + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_count = 5 + shard_count_per_tenant = 8 + total_shards = tenant_count * shard_count_per_tenant + tenant_ids = [] + + for _ in range(0, tenant_count): + tid = TenantId.generate() + tenant_ids.append(tid) + env.neon_cli.create_tenant( + tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant + ) + + # Give things a chance to settle. + # A call to `reconcile_until_idle` could be used here instead, + # however since all attachments are placed on the same node, + # we'd have to wait for a long time (2 minutes-ish) for optimizations + # to quiesce. + # TODO: once the initial attachment selection is fixed, update this + # to use `reconcile_until_idle`. 
+ time.sleep(2) + + nodes = env.storage_controller.node_list() + assert len(nodes) == 2 + + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + def poll_node_status(node_id, desired_scheduling_policy, max_attempts, backoff): + log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + while max_attempts > 0: + try: + status = env.storage_controller.node_status(node_id) + policy = status["scheduling"] + if policy == desired_scheduling_policy: + return + else: + max_attempts -= 1 + log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + + def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): + # Assert that all nodes have some attached shards + assert len(shard_counts) == len(env.pageservers) + + min_shard_count = min(shard_counts.values()) + max_shard_count = max(shard_counts.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + # Perform a graceful rolling restart + for ps in env.pageservers: + retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) + + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained 
the node + assert shard_counts[ps.id] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) + + retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) + + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + assert_shard_counts_balanced(env, shard_counts, total_shards) + + # Now check that shards are reasonably balanced + shard_counts = get_node_shard_counts(env, tenant_ids) + log.info(f"Shard counts after rolling restart: {shard_counts}") + assert_shard_counts_balanced(env, shard_counts, total_shards) diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py new file mode 100644 index 0000000000..35ae61c380 --- /dev/null +++ b/test_runner/regress/test_storage_scrubber.py @@ -0,0 +1,160 @@ +import os +import shutil +from typing import Optional + +import pytest +from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + StorageScrubber, +) +from fixtures.remote_storage import S3Storage, s3_storage +from fixtures.workload import Workload + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + """ + Test the `tenant-snapshot` subcommand, which grabs data from remote storage + + This is only a support/debug tool, but worth testing to ensure the tool does not regress. 
+ """ + + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = shard_count if shard_count is not None else 1 + + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + branch = "main" + + # Do some work + workload = Workload(env, tenant_id, timeline_id, branch) + workload.init() + + # Multiple write/flush passes to generate multiple layers + for _n in range(0, 3): + workload.write_rows(128) + + # Do some more work after a restart, so that we have multiple generations + for pageserver in env.pageservers: + pageserver.stop() + pageserver.start() + + for _n in range(0, 3): + workload.write_rows(128) + + # If we're doing multiple shards, split: this is important to exercise + # the scrubber's ability to understand the references from child shards to parent shard's layers + if shard_count is not None: + tenant_shard_ids = env.storage_controller.tenant_shard_split( + tenant_id, shard_count=shard_count + ) + + # Write after shard split: this will result in shards containing a mixture of owned + # and parent layers in their index. 
+ workload.write_rows(128) + else: + tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] + + output_path = neon_env_builder.test_output_dir / "snapshot" + os.makedirs(output_path) + + scrubber = StorageScrubber(neon_env_builder) + scrubber.tenant_snapshot(tenant_id, output_path) + + assert len(os.listdir(output_path)) > 0 + + workload.stop() + + # Stop pageservers + for pageserver in env.pageservers: + pageserver.stop() + + # Drop all shards' local storage + for tenant_shard_id in tenant_shard_ids: + pageserver = env.get_tenant_pageserver(tenant_shard_id) + shutil.rmtree(pageserver.timeline_dir(tenant_shard_id, timeline_id)) + + # Replace remote storage contents with the snapshot we downloaded + assert isinstance(env.pageserver_remote_storage, S3Storage) + + remote_tenant_path = env.pageserver_remote_storage.tenant_path(tenant_id) + + # Delete current remote storage contents + bucket = env.pageserver_remote_storage.bucket_name + remote_client = env.pageserver_remote_storage.client + deleted = 0 + for object in remote_client.list_objects_v2(Bucket=bucket, Prefix=remote_tenant_path)[ + "Contents" + ]: + key = object["Key"] + remote_client.delete_object(Key=key, Bucket=bucket) + deleted += 1 + assert deleted > 0 + + # Upload from snapshot + for root, _dirs, files in os.walk(output_path): + for file in files: + full_local_path = os.path.join(root, file) + full_remote_path = ( + env.pageserver_remote_storage.tenants_path() + + "/" + + full_local_path.removeprefix(f"{output_path}/") + ) + remote_client.upload_file(full_local_path, bucket, full_remote_path) + + for pageserver in env.pageservers: + pageserver.start() + + # Check we can read everything + workload.validate() + + +@pytest.mark.parametrize("shard_count", [None, 4]) +def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_configs() + 
env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=shard_count) + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + + # We will end up with an index per shard, per cycle, plus one for the initial startup + n_cycles = 4 + expect_indices_per_shard = n_cycles + 1 + shard_count = 1 if shard_count is None else shard_count + + # For each cycle, detach and attach the tenant to bump the generation, and do some writes to generate uploads + for _i in range(0, n_cycles): + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + # This write includes remote upload, will generate an index in this generation + workload.write_rows(1) + + # With a high min_age, the scrubber should decline to delete anything + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + + # If targeting a different tenant, the scrubber shouldn't do anything + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc( + min_age_secs=1, tenant_ids=[TenantId.generate()] + ) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + + # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py new file mode 100644 index 
0000000000..d7f3962620 --- /dev/null +++ b/test_runner/regress/test_subscriber_restart.py @@ -0,0 +1,57 @@ +import threading +import time + +from fixtures.neon_fixtures import NeonEnv +from fixtures.utils import wait_until + + +# This test checks of logical replication subscriber is able to correctly restart replication without receiving duplicates. +# It requires tracking information about replication origins at page server side +def test_subscriber_restart(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("publisher") + pub = env.endpoints.create("publisher") + pub.start() + + env.neon_cli.create_branch("subscriber") + sub = env.endpoints.create("subscriber") + sub.start() + + n_records = 100000 + n_restarts = 100 + + def check_that_changes_propagated(): + scur.execute("SELECT count(*) FROM t") + res = scur.fetchall() + assert res[0][0] == n_records + + def insert_data(pub): + with pub.cursor() as pcur: + for i in range(0, n_records): + pcur.execute("INSERT into t values (%s,random()*100000)", (i,)) + + with pub.cursor() as pcur: + with sub.cursor() as scur: + pcur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + pcur.execute("CREATE PUBLICATION pub FOR TABLE t") + scur.execute("CREATE TABLE t (pk integer primary key, sk integer)") + # scur.execute("CREATE INDEX on t(sk)") # slowdown applying WAL at replica + pub_conn = f"host=localhost port={pub.pg_port} dbname=postgres user=cloud_admin" + query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub" + scur.execute(query) + time.sleep(2) # let initial table sync complete + + thread = threading.Thread(target=insert_data, args=(pub,), daemon=True) + thread.start() + + for _ in range(n_restarts): + # restart subscriber + # time.sleep(2) + sub.stop("immediate") + sub.start() + + thread.join() + pcur.execute(f"INSERT into t values ({n_records}, 0)") + n_records += 1 + with sub.cursor() as scur: + wait_until(10, 0.5, check_that_changes_propagated) diff --git 
a/test_runner/regress/test_subxacts.py b/test_runner/regress/test_subxacts.py index eb96a8faa4..10cb00c780 100644 --- a/test_runner/regress/test_subxacts.py +++ b/test_runner/regress/test_subxacts.py @@ -1,4 +1,3 @@ -from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content @@ -13,15 +12,10 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir): env.neon_cli.create_branch("test_subxacts", "empty") endpoint = env.endpoints.create_start("test_subxacts") - log.info("postgres is running on 'test_subxacts' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() - cur.execute( - """ - CREATE TABLE t1(i int, j int); - """ - ) + cur.execute("CREATE TABLE t1(i int, j int);") cur.execute("select pg_switch_wal();") diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2ed22cabc4..2cbb036c0d 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -1,29 +1,36 @@ import json from contextlib import closing +from typing import Any, Dict import psycopg2.extras +from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind -from fixtures.types import Lsn from fixtures.utils import wait_until def test_tenant_config(neon_env_builder: NeonEnvBuilder): """Test per tenant configuration""" - # set some non-default global config - neon_env_builder.pageserver_config_override = """ -page_cache_size=444; -wait_lsn_timeout='111 s'; -[tenant_config] -checkpoint_distance = 10000 -compaction_target_size = 1048576 -evictions_low_residence_duration_metric_threshold = "2 days" -eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "23 hours" } -""" + + def set_some_nondefault_global_config(ps_cfg: Dict[str, 
Any]): + ps_cfg["page_cache_size"] = 444 + ps_cfg["wait_lsn_timeout"] = "111 s" + + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["checkpoint_distance"] = 10000 + tenant_config["compaction_target_size"] = 1048576 + tenant_config["evictions_low_residence_duration_metric_threshold"] = "2 days" + tenant_config["eviction_policy"] = { + "kind": "LayerAccessThreshold", + "period": "20s", + "threshold": "23 hours", + } + + neon_env_builder.pageserver_config_override = set_some_nondefault_global_config env = neon_env_builder.init_start() # we configure eviction but no remote storage, there might be error lines @@ -270,7 +277,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold = "period": "20s", "threshold": "23h", } - assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024 + assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024 # restart the pageserver and ensure that the config is still correct env.pageserver.stop() @@ -299,8 +306,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): # tenant is created with defaults, as in without config file (tenant_id, timeline_id) = env.neon_cli.create_tenant() - config_path = env.pageserver.tenant_dir(tenant_id) / "config" - assert config_path.exists(), "config file is always initially created" + config_path = env.pageserver.tenant_dir(tenant_id) / "config-v1" http_client = env.pageserver.http_client() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index fece876459..fd3cc45c3f 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -5,10 +5,12 @@ import shutil from threading import Thread import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, + StorageScrubber, last_flush_lsn_upload, 
wait_for_last_flush_lsn, ) @@ -19,13 +21,14 @@ from fixtures.pageserver.utils import ( assert_prefix_not_empty, poll_for_remote_storage_iterations, tenant_delete_wait_completed, + wait_for_upload, wait_tenant_status_404, wait_until_tenant_active, wait_until_tenant_state, ) from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage -from fixtures.types import TenantId from fixtures.utils import run_pg_bench_small, wait_until +from requests.exceptions import ReadTimeout def test_tenant_delete_smoke( @@ -51,9 +54,26 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") - with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"): - ps_http.tenant_delete(tenant_id=tenant_id) + env.pageserver.allowed_errors.append(".*NotFound.*") + env.pageserver.allowed_errors.append(".*simulated failure.*") + + # Check that deleting a non-existent tenant gives the expected result: this is a loop because we + # may need to retry on some remote storage errors injected by the test harness + while True: + try: + ps_http.tenant_delete(tenant_id=tenant_id) + except PageserverApiException as e: + if e.status_code == 500: + # This test uses failure injection, which can produce 500s as the pageserver expects + # the object store to always be available, and the ListObjects during deletion is generally + # an infallible operation + assert "simulated failure of remote operation" in e.message + elif e.status_code == 404: + # This is our expected result: trying to erase a non-existent tenant gives us 404 + assert "NotFound" in e.message + break + else: + raise env.neon_cli.create_tenant( tenant_id=tenant_id, @@ -61,7 +81,7 @@ def test_tenant_delete_smoke( ) # Default tenant and the one we created - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 + assert 
ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 # create two timelines one being the parent of another parent = None @@ -74,7 +94,7 @@ def test_tenant_delete_smoke( wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id) assert_prefix_not_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -85,17 +105,20 @@ def test_tenant_delete_smoke( parent = timeline + # Upload a heatmap so that we exercise deletion of that too + ps_http.tenant_heatmap_upload(tenant_id) + iterations = poll_for_remote_storage_iterations(remote_storage_kind) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2 tenant_delete_wait_completed(ps_http, tenant_id, iterations) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 tenant_path = env.pageserver.tenant_dir(tenant_id) assert not tenant_path.exists() assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -105,7 +128,7 @@ def test_tenant_delete_smoke( ) # Deletion updates the tenant count: the one default tenant remains - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 class Check(enum.Enum): @@ -127,7 +150,6 @@ FAILPOINTS = [ "timeline-delete-before-index-deleted-at", "timeline-delete-before-rm", "timeline-delete-before-index-delete", - "timeline-delete-after-rm-dir", ] FAILPOINTS_BEFORE_BACKGROUND = [ @@ -182,12 +204,14 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then 
pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # We may leave some upload tasks in the queue. They're likely deletes. # For uploads we explicitly wait with `last_flush_lsn_upload` below. # So by ignoring these instead of waiting for empty upload queue # we execute more distinct code paths. '.*stopping left-over name="remote upload".*', + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) @@ -206,7 +230,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) assert_prefix_not_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -267,7 +291,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( # Check remote is empty assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -303,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach( # sanity check, data should be there assert_prefix_not_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -323,7 +347,7 @@ def test_tenant_delete_is_resumed_on_attach( # From deletion polling f".*NotFound: tenant {env.initial_tenant}.*", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when 
flush_loop is not running, state is Exited", # error from http response is also logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*', @@ -342,7 +366,7 @@ def test_tenant_delete_is_resumed_on_attach( ) assert_prefix_not_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -377,7 +401,7 @@ def test_tenant_delete_is_resumed_on_attach( ps_http.deletion_queue_flush(execute=True) assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -410,9 +434,7 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE pageserver_http.configure_failpoints((failpoint, "pause")) def hit_pausable_failpoint_and_later_fail(): - with pytest.raises( - PageserverApiException, match="new timeline \\S+ has invalid disk_consistent_lsn" - ): + with pytest.raises(PageserverApiException, match="NotFound: tenant"): pageserver_http.timeline_create( env.pg_version, env.initial_tenant, env.initial_timeline ) @@ -442,8 +464,8 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE try: wait_until(10, 1, has_hit_failpoint) - # it should start ok, sync up with the stuck creation, then fail because disk_consistent_lsn was not updated - # then deletion should fail and set the tenant broken + # it should start ok, sync up with the stuck creation, then hang waiting for the timeline + # to shut down. deletion = Thread(target=start_deletion) deletion.start() @@ -467,7 +489,8 @@ def test_tenant_delete_concurrent( ): """ Validate that concurrent delete requests to the same tenant behave correctly: - exactly one should succeed. + exactly one should execute: the rest should give 202 responses but not start + another deletion. 
This is a reproducer for https://github.com/neondatabase/neon/issues/5936 """ @@ -482,14 +505,10 @@ def test_tenant_delete_concurrent( run_pg_bench_small(pg_bin, endpoint.connstr()) last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) - CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken" - env.pageserver.allowed_errors.extend( [ # lucky race with stopping from flushing a layer we fail to schedule any uploads ".*layer flush task.+: could not flush frozen layer: update_metadata_file", - # Errors logged from our 4xx requests - f".*{CONFLICT_MESSAGE}.*", ] ) @@ -505,10 +524,10 @@ def test_tenant_delete_concurrent( return ps_http.tenant_delete(tenant_id) def hit_remove_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}") + return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1] def hit_run_failpoint(): - assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}") with concurrent.futures.ThreadPoolExecutor() as executor: background_200_req = executor.submit(delete_tenant) @@ -516,11 +535,14 @@ def test_tenant_delete_concurrent( # Wait until the first request completes its work and is blocked on removing # the TenantSlot from tenant manager. 
- wait_until(100, 0.1, hit_remove_failpoint) + log_cursor = wait_until(100, 0.1, hit_remove_failpoint) + assert log_cursor is not None - # Start another request: this should fail when it sees a tenant in Stopping state - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - ps_http.tenant_delete(tenant_id) + # Start another request: this should succeed without actually entering the deletion code + ps_http.tenant_delete(tenant_id) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Start another background request, which will pause after acquiring a TenantSlotGuard # but before completing. @@ -530,19 +552,23 @@ def test_tenant_delete_concurrent( # The TenantSlot is still present while the original request is hung before # final removal - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1 + assert ( + ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 + ) # Permit the original request to run to success ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off")) # Permit the duplicate background request to run to completion and fail. 
ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off")) - with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE): - background_4xx_req.result(timeout=10) + background_4xx_req.result(timeout=10) + assert not env.pageserver.log_contains( + f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor + ) # Physical deletion should have happened assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -552,4 +578,158 @@ def test_tenant_delete_concurrent( ) # Zero tenants remain (we deleted the default tenant) - assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0 + + +def test_tenant_delete_races_timeline_creation( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + """ + Validate that timeline creation executed in parallel with deletion works correctly. + + This is a reproducer for https://github.com/neondatabase/neon/issues/6255 + """ + # The remote storage kind doesn't really matter but we use it for iterations calculation below + # (and there is no way to reconstruct the used remote storage kind) + remote_storage_kind = RemoteStorageKind.MOCK_S3 + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + ps_http = env.pageserver.http_client() + tenant_id = env.initial_tenant + + # When timeline creation is cancelled by tenant deletion, it is during Tenant::shutdown(), and + # acting on a shutdown tenant generates a 503 response (if caller retried they would later) get + # a 404 after the tenant is fully deleted. + CANCELLED_ERROR = ( + ".*POST.*Cancelled request finished successfully status=503 Service Unavailable" + ) + + # This can occur sometimes. 
+ CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*" + + env.pageserver.allowed_errors.extend( + [ + # lucky race with stopping from flushing a layer we fail to schedule any uploads + ".*layer flush task.+: could not flush frozen layer: update_metadata_file", + # We need the http connection close for successful reproduction + ".*POST.*/timeline.* request was dropped before completing", + # Timeline creation runs into this error + CANCELLED_ERROR, + # Timeline deletion can run into this error during deletion + CONFLICT_MESSAGE, + ".*tenant_delete_handler.*still waiting, taking longer than expected.*", + ] + ) + + BEFORE_INITDB_UPLOAD_FAILPOINT = "before-initdb-upload" + DELETE_BEFORE_CLEANUP_FAILPOINT = "tenant-delete-before-cleanup-remaining-fs-traces-pausable" + + # Wait just before the initdb upload + ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause")) + + def timeline_create(): + try: + ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) + raise RuntimeError("creation succeeded even though it shouldn't") + except ReadTimeout: + pass + + Thread(target=timeline_create).start() + + def hit_initdb_upload_failpoint(): + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") + + wait_until(100, 0.1, hit_initdb_upload_failpoint) + + def creation_connection_timed_out(): + env.pageserver.assert_log_contains( + "POST.*/timeline.* request was dropped before completing" + ) + + # Wait so that we hit the timeout and the connection is dropped + # (But timeline creation still continues) + wait_until(100, 0.1, creation_connection_timed_out) + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + + def tenant_delete(): + def tenant_delete_inner(): + ps_http.tenant_delete(tenant_id) + + wait_until(100, 0.5, tenant_delete_inner) + + Thread(target=tenant_delete).start() + + def deletion_arrived(): + 
env.pageserver.assert_log_contains( + f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" + ) + + wait_until(100, 0.1, deletion_arrived) + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + + # Disable the failpoint and wait for deletion to finish + ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + + tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True) + + # Physical deletion should have happened + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + # Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file + env.pageserver.assert_log_contains(CANCELLED_ERROR) + assert not env.pageserver.log_contains( + ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion" + ) + + # Zero tenants remain (we deleted the default tenant) + assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0 + + +def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): + """ + Validate that creating and then deleting the tenant both survives the scrubber, + and that one can run the scrubber without problems. + """ + + remote_storage_kind = RemoteStorageKind.MOCK_S3 + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + scrubber = StorageScrubber(neon_env_builder) + env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + + ps_http = env.pageserver.http_client() + # create a tenant separate from the main tenant so that we have one remaining + # after we deleted it, as the scrubber treats empty buckets as an error. 
+ (tenant_id, timeline_id) = env.neon_cli.create_tenant() + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + run_pg_bench_small(pg_bin, endpoint.connstr()) + last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + env.stop() + + result = scrubber.scan_metadata() + assert result["with_warnings"] == [] + + env.start() + ps_http = env.pageserver.http_client() + iterations = poll_for_remote_storage_iterations(remote_storage_kind) + tenant_delete_wait_completed(ps_http, tenant_id, iterations) + env.stop() + + scrubber.scan_metadata() + assert result["with_warnings"] == [] diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index c6dbc77885..871351b2d5 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -7,6 +7,7 @@ from typing import List, Optional import asyncpg import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -22,7 +23,6 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import ( RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until from prometheus_client.samples import Sample @@ -92,10 +92,10 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn) # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -132,7 +132,7 @@ def 
test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list timelines.*failed, will retry.*") + assert env.pageserver.log_contains(".*list identifiers.*failed, will retry.*") assert env.pageserver.log_contains(".*download.*failed, will retry.*") @@ -302,7 +302,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # gc should not try to even start on a timeline that doesn't exist with pytest.raises( - expected_exception=PageserverApiException, match="gc target timeline does not exist" + expected_exception=PageserverApiException, match="NotFound: Timeline not found" ): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) @@ -310,7 +310,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ # the error will be printed to the log too - ".*gc target timeline does not exist.*", + ".*NotFound: Timeline not found.*", # Timelines get stopped during detach, ignore the gc calls that error, witnessing that ".*InternalServerError\\(timeline is Stopping.*", ] @@ -482,7 +482,7 @@ def test_detach_while_attaching( pageserver_http.tenant_detach(tenant_id) # And re-attach - pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")]) + pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(5000)")]) env.pageserver.tenant_attach(tenant_id) @@ -627,7 +627,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder # Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally # Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored. 
-def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder): +def test_load_negatives(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -644,25 +644,16 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder): ): env.pageserver.tenant_load(tenant_id) - with pytest.raises( - expected_exception=PageserverApiException, - match=f"tenant {tenant_id} already exists, state: Active", - ): - env.pageserver.tenant_attach(tenant_id) - pageserver_http.tenant_ignore(tenant_id) - env.pageserver.allowed_errors.append(".*tenant directory already exists.*") - with pytest.raises( - expected_exception=PageserverApiException, - match="tenant directory already exists", - ): - env.pageserver.tenant_attach(tenant_id) - -def test_ignore_while_attaching( +def test_detach_while_activating( neon_env_builder: NeonEnvBuilder, ): + """ + Test cancellation behavior for tenants that are stuck somewhere between + being attached and reaching Active state. + """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() @@ -684,39 +675,28 @@ def test_ignore_while_attaching( data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) - tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] + tenants_before_detach = [tenant["id"] for tenant in pageserver_http.tenant_list()] # Detach it pageserver_http.tenant_detach(tenant_id) + # And re-attach, but stop attach task_mgr task from completing - pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")]) + pageserver_http.configure_failpoints([("attach-before-activate-sleep", "return(600000)")]) env.pageserver.tenant_attach(tenant_id) - # Run ignore on the task, thereby cancelling the attach. 
- # XXX This should take priority over attach, i.e., it should cancel the attach task. - # But neither the failpoint, nor the proper remote_timeline_client download functions, - # are sensitive to task_mgr::shutdown. - # This problem is tracked in https://github.com/neondatabase/neon/issues/2996 . - # So, for now, effectively, this ignore here will block until attach task completes. - pageserver_http.tenant_ignore(tenant_id) - # Cannot attach it due to some local files existing - env.pageserver.allowed_errors.append(".*tenant directory already exists.*") - with pytest.raises( - expected_exception=PageserverApiException, - match="tenant directory already exists", - ): - env.pageserver.tenant_attach(tenant_id) + # The tenant is in the Activating state. This should not block us from + # shutting it down and detaching it. + pageserver_http.tenant_detach(tenant_id) - tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()] - assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing" - assert len(tenants_after_ignore) + 1 == len( - tenants_before_ignore + tenants_after_detach = [tenant["id"] for tenant in pageserver_http.tenant_list()] + assert tenant_id not in tenants_after_detach, "Detached tenant should be missing" + assert len(tenants_after_detach) + 1 == len( + tenants_before_detach ), "Only ignored tenant should be missing" - # Calling load will bring the tenant back online - pageserver_http.configure_failpoints([("attach-before-activate", "off")]) - env.pageserver.tenant_load(tenant_id) - + # Subsequently attaching it again should still work + pageserver_http.configure_failpoints([("attach-before-activate-sleep", "off")]) + env.pageserver.tenant_attach(tenant_id) wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) endpoint.stop() @@ -762,8 +742,6 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint): def test_metrics_while_ignoring_broken_tenant_and_reloading( neon_env_builder: 
NeonEnvBuilder, ): - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() client = env.pageserver.http_client() @@ -781,56 +759,37 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( client.tenant_break(env.initial_tenant) - found_broken = False - active, broken, broken_set = ([], [], []) - for _ in range(10): + def found_broken(): m = client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 + assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 - if found_broken: - break - log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}") - time.sleep(0.5) - assert ( - found_broken - ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_broken) client.tenant_ignore(env.initial_tenant) - found_broken = False - broken, broken_set = ([], []) - for _ in range(10): + def found_cleaned_up(): m = client.get_metrics() broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_broken = only_int(broken) == 0 and only_int(broken_set) == 1 + assert only_int(broken) == 0 and len(broken_set) == 0 - if found_broken: - break - time.sleep(0.5) - assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" + wait_until(10, 0.5, found_cleaned_up) env.pageserver.tenant_load(env.initial_tenant) - found_active = False - active, broken_set = ([], []) - for _ in range(10): + def found_active(): m = 
client.get_metrics() active = m.query_all("pageserver_tenant_states_count", {"state": "Active"}) broken_set = m.query_all( "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)} ) - found_active = only_int(active) == 1 and len(broken_set) == 0 + assert only_int(active) == 1 and len(broken_set) == 0 - if found_active: - break - time.sleep(0.5) - - assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" + wait_until(10, 0.5, found_active) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 1887bca23b..be289e03d6 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import Any, Dict, Optional, Tuple import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder, NeonPageserver from fixtures.pageserver.http import PageserverHttpClient @@ -16,15 +17,12 @@ from fixtures.pageserver.utils import ( wait_for_upload, wait_tenant_status_404, ) -from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, - subprocess_capture, wait_until, ) @@ -67,6 +65,7 @@ def load(endpoint: Endpoint, stop_event: threading.Event, load_ok_event: threadi log.info("successfully recovered %s", inserted_ctr) failed = False load_ok_event.set() + log.info("load thread stopped") @@ -144,26 +143,19 @@ def check_timeline_attached( def switch_pg_to_new_pageserver( origin_ps: NeonPageserver, endpoint: Endpoint, - new_pageserver_port: int, + new_pageserver_id: int, tenant_id: TenantId, timeline_id: TimelineId, ) -> Path: + # We could reconfigure online with 
endpoint.reconfigure(), but this stop/start + # is needed to trigger the logic in load() to set its ok event after restart. endpoint.stop() - - pg_config_file_path = Path(endpoint.config_file_path()) - pg_config_file_path.open("a").write( - f"\nneon.pageserver_connstring = 'postgresql://no_user:@localhost:{new_pageserver_port}'" - ) - - endpoint.start() + endpoint.start(pageserver_id=new_pageserver_id) timeline_to_detach_local_path = origin_ps.timeline_dir(tenant_id, timeline_id) files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( - "metadata" in files_before_detach - ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}" - assert ( - len(files_before_detach) >= 2 + len(files_before_detach) >= 1 ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" return timeline_to_detach_local_path @@ -190,20 +182,14 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca # A minor migration involves no storage breaking changes. # It is done by attaching the tenant to a new pageserver. "minor", - # A major migration involves exporting a postgres datadir - # basebackup and importing it into the new pageserver. - # This kind of migration can tolerate breaking changes - # to storage format - "major", + # In the unlikely and unfortunate event that we have to break + # the storage format, extend this test with the param below. 
+ # "major", ], ) @pytest.mark.parametrize("with_load", ["with_load", "without_load"]) def test_tenant_relocation( neon_env_builder: NeonEnvBuilder, - port_distributor: PortDistributor, - test_output_dir: Path, - neon_binpath: Path, - base_dir: Path, method: str, with_load: str, ): @@ -212,12 +198,10 @@ def test_tenant_relocation( env = neon_env_builder.init_start() - tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209") + tenant_id = env.initial_tenant env.pageservers[0].allowed_errors.extend( [ - # FIXME: Is this expected? - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", # We will dual-attach in this test, so stale generations are expected @@ -236,8 +220,7 @@ def test_tenant_relocation( origin_http = origin_ps.http_client() destination_http = destination_ps.http_client() - _, initial_timeline_id = env.neon_cli.create_tenant(tenant_id) - log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id) + log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, env.initial_timeline) env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) ep_main = env.endpoints.create_start( @@ -308,40 +291,7 @@ def test_tenant_relocation( current_lsn=current_lsn_second, ) - # Migrate either by attaching from s3 or import/export basebackup - if method == "major": - cmd = [ - "poetry", - "run", - "python", - str(base_dir / "scripts/export_import_between_pageservers.py"), - "--tenant-id", - str(tenant_id), - "--from-host", - "localhost", - "--from-http-port", - str(origin_http.port), - "--from-pg-port", - str(origin_ps.service_port.pg), - "--to-host", - "localhost", - "--to-http-port", - str(destination_http.port), - "--to-pg-port", - str(destination_ps.service_port.pg), - "--pg-distrib-dir", - str(neon_env_builder.pg_distrib_dir), - "--work-dir", - str(test_output_dir), - 
"--tmp-pg-port", - str(port_distributor.get_port()), - ] - subprocess_capture(test_output_dir, cmd, check=True) - - destination_ps.allowed_errors.append( - ".*ignored .* unexpected bytes after the tar archive.*" - ) - elif method == "minor": + if method == "minor": # call to attach timeline to new pageserver destination_ps.tenant_attach(tenant_id) @@ -380,7 +330,7 @@ def test_tenant_relocation( old_local_path_main = switch_pg_to_new_pageserver( origin_ps, ep_main, - destination_ps.service_port.pg, + destination_ps.id, tenant_id, timeline_id_main, ) @@ -388,7 +338,7 @@ def test_tenant_relocation( old_local_path_second = switch_pg_to_new_pageserver( origin_ps, ep_second, - destination_ps.service_port.pg, + destination_ps.id, tenant_id, timeline_id_second, ) @@ -504,7 +454,7 @@ def test_emergency_relocate_with_branches_slow_replay( assert cur.fetchall() == [("before pause",), ("after pause",)] # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up @@ -641,7 +591,7 @@ def test_emergency_relocate_with_branches_createdb( assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200 # Sanity check that the failpoint was reached - assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') assert time.time() - before_attach_time > 5 # Clean up diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 7cea301a9c..a3dd422903 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,7 +1,10 @@ +import os +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import List, Tuple 
import pytest +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -10,18 +13,20 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( + tenant_delete_wait_completed, timeline_delete_wait_completed, wait_until_tenant_active, ) from fixtures.pg_version import PgVersion -from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until -@pytest.mark.xfail -def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): - env = neon_simple_env +def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) @@ -34,66 +39,25 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] assert branch_name == main_branch_name - with env.endpoints.create_start( + endpoint = env.endpoints.create_start( main_branch_name, tenant_id=tenant_id, config_lines=["autovacuum=off", "checkpoint_timeout=10min"], - ) as endpoint: - with endpoint.cursor() as cur: - cur.execute("SELECT 1") - row = cur.fetchone() - assert row is not None - assert row[0] == 1 - size = http_client.tenant_size(tenant_id) - # we've disabled the autovacuum and checkpoint - # so background processes should not change the size. 
- # If this test will flake we should probably loosen the check - assert ( - size == initial_size - ), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})" + ) - # the size should be the same, until we increase the size over the - # gc_horizon - size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) - assert ( - size == initial_size - ), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})" + with endpoint.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + assert row is not None + assert row[0] == 1 - expected_inputs = { - "segments": [ - { - "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchStart", - }, - { - "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchEnd", - }, - ], - "timeline_inputs": [ - { - "timeline_id": f"{main_timeline_id}", - "ancestor_id": None, - "ancestor_lsn": "0/0", - "last_record": "0/1698CC0", - "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/0", - "pitr_cutoff": "0/0", - "next_gc_cutoff": "0/0", - "retention_param_cutoff": None, - } - ], - } - expected_inputs = mask_model_inputs(expected_inputs) - actual_inputs = mask_model_inputs(inputs) + # The transaction above will make the compute generate a checkpoint. + # In turn, the pageserver persists the checkpoint. This should only be + # one key with a size of a couple hundred bytes. 
+ wait_for_last_flush_lsn(env, endpoint, tenant_id, main_timeline_id) + size = http_client.tenant_size(tenant_id) - assert expected_inputs == actual_inputs - - size_debug_file = open(test_output_dir / "size_debug.html", "w") - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + assert size >= initial_size and size - initial_size < 1024 def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -189,7 +153,6 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 15 @@ -232,7 +195,6 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 5 @@ -281,7 +243,6 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = small @@ -326,7 +287,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa size_debug_file.write(size_debug) -@pytest.mark.xfail +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") def test_single_branch_get_tenant_size_grows( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): @@ -334,25 +295,15 @@ def test_single_branch_get_tenant_size_grows( Operate on single branch reading the tenants size after each transaction. 
""" - # Disable automatic gc and compaction. - # The pitr_interval here is quite problematic, so we cannot really use it. - # it'd have to be calibrated per test executing env. + # Disable automatic compaction and GC, and set a long PITR interval: we will expect + # size to always increase with writes as all writes remain within the PITR + tenant_config = { + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "3600s", + } - # there was a bug which was hidden if the create table and first batch of - # inserts is larger than gc_horizon. for example 0x20000 here hid the fact - # that there next_gc_cutoff could be smaller than initdb_lsn, which will - # obviously lead to issues when calculating the size. - gc_horizon = 0x3BA00 - - # it's a bit of a hack, but different versions of postgres have different - # amount of WAL generated for the same amount of data. so we need to - # adjust the gc_horizon accordingly. - if pg_version == PgVersion.V14: - gc_horizon = 0x4A000 - - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) tenant_id = env.initial_tenant branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0] @@ -363,18 +314,6 @@ def test_single_branch_get_tenant_size_grows( size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change( - current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int - ): - if current_lsn - initdb_lsn >= gc_horizon: - assert ( - size >= prev_size - ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})" - else: - assert ( - size > prev_size - ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: 
{current_lsn}, Init at: {initdb_lsn})" - def get_current_consistent_size( env: NeonEnv, endpoint: Endpoint, @@ -405,6 +344,7 @@ def test_single_branch_get_tenant_size_grows( current_lsn = after_lsn size_debug_file.write(size_debug) assert size > 0 + log.info(f"size: {size} at lsn {current_lsn}") return (current_lsn, size) with env.endpoints.create_start( @@ -442,14 +382,6 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - # "gc_horizon" is tuned so that it kicks in _after_ the - # insert phase, but before the update phase ends. - assert ( - current_lsn - initdb_lsn <= gc_horizon - ), "Tuning of GC window is likely out-of-date" assert size > prev_size collected_responses.append(("INSERT", current_lsn, size)) @@ -469,8 +401,7 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("UPDATE", current_lsn, size)) @@ -487,30 +418,42 @@ def test_single_branch_get_tenant_size_grows( ) prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + assert size > prev_size collected_responses.append(("DELETE", current_lsn, size)) + size_before_drop = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + )[1] + with endpoint.cursor() as cur: cur.execute("DROP TABLE t0") - # The size of the tenant should still be as large as before we dropped - # the table, because the drop operation can still be undone in the PITR - # defined by gc_horizon. + # Dropping the table doesn't reclaim any space + # from the user's point of view, because the DROP transaction is still + # within pitr_interval. 
(current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) + assert size >= prev_size + prev_size = size - prev_size = collected_responses[-1][2] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) + # Set a zero PITR interval to allow the DROP to impact the synthetic size + # Because synthetic size calculation uses pitr interval when available, + # when our tenant is configured with a tiny pitr interval, dropping a table should + # cause synthetic size to go down immediately + tenant_config["pitr_interval"] = "0s" + env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) + (current_lsn, size) = get_current_consistent_size( + env, endpoint, size_debug_file, http_client, tenant_id, timeline_id + ) + assert size < size_before_drop + # The size of the tenant should still be as large as before we dropped + # the table, because the drop operation can still be undone in the PITR + # defined by gc_horizon. collected_responses.append(("DROP", current_lsn, size)) - # Should have gone past gc_horizon, otherwise gc_horizon is too large - assert current_lsn - initdb_lsn > gc_horizon - # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. @@ -559,9 +502,14 @@ def test_get_tenant_size_with_multiple_branches( gc_horizon = 128 * 1024 - neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "0sec", + "gc_horizon": gc_horizon, + } + ) # FIXME: we have a race condition between GC and delete timeline. GC might fail with this # error. 
Similar to https://github.com/neondatabase/neon/issues/2671 @@ -676,6 +624,64 @@ def test_get_tenant_size_with_multiple_branches( size_debug_file.write(size_debug) +def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): + """ + Makes sure synthetic size can still be calculated even if one of the + timelines is deleted or the tenant is deleted. + """ + + env = neon_env_builder.init_start() + failpoint = "Timeline::find_gc_cutoffs-pausable" + client = env.pageserver.http_client() + + orig_size = client.tenant_size(env.initial_tenant) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + _, last_offset = wait_until( + 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + ) + + timeline_delete_wait_completed(client, env.initial_tenant, branch_id) + + client.configure_failpoints((failpoint, "off")) + size = completion.result() + + assert_size_approx_equal(orig_size, size) + + branch_id = env.neon_cli.create_branch( + tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch2" + ) + client.configure_failpoints((failpoint, "pause")) + + with ThreadPoolExecutor(max_workers=1) as exec: + completion = exec.submit(client.tenant_size, env.initial_tenant) + wait_until( + 10, + 1.0, + lambda: env.pageserver.assert_log_contains( + f"at failpoint {failpoint}", offset=last_offset + ), + ) + + tenant_delete_wait_completed(client, env.initial_tenant, 10) + + client.configure_failpoints((failpoint, "off")) + + # accept both, because the deletion might still complete before + matcher = "(Failed to refresh gc_info before gathering inputs|NotFound: tenant)" + with pytest.raises(PageserverApiException, match=matcher): + completion.result() + + # this happens only in 
the case of deletion (http response logging) + env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*") + + # Helper for tests that compare timeline_inputs # We don't want to compare the exact values, because they can be unstable # and cause flaky tests. So replace the values with useful invariants. diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 75e5c2c91c..d08ad3cd2e 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -1,3 +1,4 @@ +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.utils import ( @@ -5,7 +6,6 @@ from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_until_tenant_active, ) -from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 22036884ee..93e9ad3673 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,3 +1,4 @@ +import concurrent.futures import os import time from contextlib import closing @@ -7,6 +8,8 @@ from pathlib import Path from typing import List import pytest +import requests +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -16,10 +19,12 @@ from fixtures.metrics import ( from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + wait_for_last_flush_lsn, ) -from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import timeline_delete_wait_completed, wait_until_tenant_active +from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind -from fixtures.types 
import Lsn, TenantId from fixtures.utils import wait_until from prometheus_client.samples import Sample @@ -29,18 +34,15 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): initial_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) - initial_tenant_dirs = [d for d in tenants_dir.iterdir()] + [d for d in tenants_dir.iterdir()] - neon_simple_env.pageserver.allowed_errors.extend( - [ - ".*Failed to create directory structure for tenant .*, cleaning tmp data.*", - ".*Failed to fsync removed temporary tenant directory .*", - ] - ) + error_regexes = [".*tenant-config-before-write.*"] + neon_simple_env.pageserver.allowed_errors.extend(error_regexes) + neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() - pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return")) - with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"): + pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) + with pytest.raises(Exception, match="tenant-config-before-write"): _ = neon_simple_env.neon_cli.create_tenant() new_tenants = sorted( @@ -48,10 +50,10 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): ) assert initial_tenants == new_tenants, "should not create new tenants" - new_tenant_dirs = [d for d in tenants_dir.iterdir()] - assert ( - new_tenant_dirs == initial_tenant_dirs - ), "pageserver should clean its temp tenant dirs on tenant creation failure" + # Any files left behind on disk during failed creation do not prevent + # a retry from succeeding. 
+ pageserver_http.configure_failpoints(("tenant-config-before-write", "off")) + neon_simple_env.neon_cli.create_tenant() def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): @@ -219,14 +221,14 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()]) log.info(f"{sample.name}{{{labels}}} {sample.value}") - # Test that we gather tenant create metric + # Test that we gather tenant operations metrics storage_operation_metrics = [ "pageserver_storage_operations_seconds_global_bucket", "pageserver_storage_operations_seconds_global_sum", "pageserver_storage_operations_seconds_global_count", ] for metric in storage_operation_metrics: - value = ps_metrics.query_all(metric, filter={"operation": "create tenant"}) + value = ps_metrics.query_all(metric, filter={"operation": "layer flush"}) assert value @@ -286,7 +288,6 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*marking .* as locally complete, while it doesnt exist in remote index.*", ".*load failed.*list timelines directory.*", ] ) @@ -346,3 +347,123 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): assert ( tenant_active_count == 1 ), f"Tenant {tenant_with_empty_timelines} should have metric as active" + + +def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): + """ + Probabilistic stress test for the pageserver's handling of tenant requests + across a restart. This is intended to catch things like: + - Bad response codes during shutdown (e.g. 
returning 500 instead of 503) + - Issues where a tenant is still starting up while we receive a request for it + - Issues with interrupting/resuming tenant/timeline creation in shutdown + """ + env = neon_env_builder.init_configs() + env.start() + tenant_id: TenantId = env.initial_tenant + timeline_id = env.initial_timeline + + # Multiple creation requests which race will generate this error + env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*") + + # Tenant creation requests which arrive out of order will generate complaints about + # generation numbers out of order. + env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") + + # Our multiple creation requests will advance generation quickly, and when we skip + # a generation number we can generate these warnings + env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+") + + # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of + # an incomplete attach, or some other problem. In the field this should be rare, + # so we allow it to log at WARN, even if it is occasionally a false positive. + env.pageserver.allowed_errors.append(".*failed to freeze and flush.*") + + def create_bg(delay_ms): + time.sleep(delay_ms / 1000.0) + try: + env.pageserver.tenant_create(tenant_id=tenant_id) + env.pageserver.http_client().timeline_create( + PgVersion.NOT_SET, tenant_id, new_timeline_id=timeline_id + ) + except PageserverApiException as e: + if e.status_code == 409: + log.info(f"delay_ms={delay_ms} 409") + pass + elif e.status_code == 429: + log.info(f"delay_ms={delay_ms} 429") + pass + elif e.status_code == 400: + if "is less than existing" in e.message: + # We send creation requests very close together in time: it is expected that these + # race, and sometimes higher-generation'd requests arrive first. The pageserver rightly + # rejects any attempt to make a generation number go backwards. 
+ pass + else: + raise + else: + raise + except requests.exceptions.ConnectionError: + # Our requests might arrive during shutdown and be cut off at the transport level + pass + + for _ in range(0, 10): + with concurrent.futures.ThreadPoolExecutor() as executor: + futs = [] + for delay_ms in (0, 1, 10, 50, 100, 200, 500, 800): + f = executor.submit(create_bg, delay_ms) + futs.append(f) + env.pageserver.stop() + env.pageserver.start() + + for f in futs: + f.result(timeout=10) + + # The tenant should end up active + wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1) + + +def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): + """Test for the directory_entries_count metric""" + + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + endpoint_tenant = env.endpoints.create_start("main", tenant_id=env.initial_tenant) + + # Not sure why but this many tables creates more relations than our limit + TABLE_COUNT = 1600 + COUNT_AT_LEAST_EXPECTED = 5500 + + with endpoint_tenant.connect() as conn: + with conn.cursor() as cur: + # Wrapping begin; commit; around this and the loop below keeps the reproduction + # but it also doesn't have a performance benefit + cur.execute("CREATE TABLE template_tbl(key int primary key, value text);") + for i in range(TABLE_COUNT): + cur.execute(f"CREATE TABLE tbl_{i}(like template_tbl INCLUDING ALL);") + wait_for_last_flush_lsn(env, endpoint_tenant, env.initial_tenant, env.initial_timeline) + endpoint_tenant.stop() + + m = ps_http.get_metrics() + directory_entries_count_metric = m.query_all( + "pageserver_directory_entries_count", {"tenant_id": str(env.initial_tenant)} + ) + + def only_int(samples: List[Sample]) -> int: + assert len(samples) == 1 + return int(samples[0].value) + + directory_entries_count = only_int(directory_entries_count_metric) + + 
log.info(f"pageserver_directory_entries_count metric value: {directory_entries_count}") + + assert directory_entries_count > COUNT_AT_LEAST_EXPECTED + + timeline_detail = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline) + + counts = timeline_detail["directory_entries_counts"] + assert counts + log.info(f"directory counts: {counts}") + assert counts[2] > COUNT_AT_LEAST_EXPECTED diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6f05d7f7cb..168876b711 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -11,6 +11,7 @@ import os from pathlib import Path from typing import List, Tuple +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, @@ -18,6 +19,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) +from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_tenant_state, wait_for_last_record_lsn, @@ -27,7 +29,6 @@ from fixtures.remote_storage import ( LocalFsStorage, RemoteStorageKind, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until @@ -61,11 +62,6 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints): def test_tenants_many(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - # FIXME: Is this expected? 
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - tenants_endpoints: List[Tuple[TenantId, Endpoint]] = [] for _ in range(1, 5): @@ -117,14 +113,6 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() - env.pageserver.allowed_errors.extend( - [ - # FIXME: Are these expected? - ".*No timelines to attach received.*", - ".*marking .* as locally complete, while it doesnt exist in remote index.*", - ] - ) - pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -160,10 +148,10 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): log.info(f"upload of checkpoint {checkpoint_number} is done") # Check that we had to retry the uploads - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadLayer.*, will retry.*" ) - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) @@ -223,9 +211,6 @@ def test_tenant_redownloads_truncated_file_on_startup( env.pageserver.allowed_errors.extend( [ ".*removing local file .* because .*", - # FIXME: Are these expected? - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", - ".*No timelines to attach received.*", ] ) @@ -262,7 +247,10 @@ def test_tenant_redownloads_truncated_file_on_startup( # ensure the same size is found from the index_part.json index_part = env.pageserver_remote_storage.index_content(tenant_id, timeline_id) - assert index_part["layer_metadata"][path.name]["file_size"] == expected_size + assert ( + index_part["layer_metadata"][parse_layer_file_name(path.name).to_str()]["file_size"] + == expected_size + ) ## Start the pageserver. 
It will notice that the file size doesn't match, and ## rename away the local file. It will be re-downloaded when it's needed. @@ -292,7 +280,7 @@ def test_tenant_redownloads_truncated_file_on_startup( # the remote side of local_layer_truncated remote_layer_path = env.pageserver_remote_storage.remote_layer_path( - tenant_id, timeline_id, path.name + tenant_id, timeline_id, parse_layer_file_name(path.name).to_str() ) # if the upload ever was ongoing, this check would be racy, but at least one diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 5f72cfd747..7bf49a0874 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -179,6 +179,6 @@ def test_threshold_based_eviction( assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized" assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident" - assert env.pageserver.log_contains( - metrics_refused_log_line + assert ( + env.pageserver.log_contains(metrics_refused_log_line) is not None ), "ensure the metrics collection worker ran" diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 82ffcb1177..da37f469b3 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -6,6 +6,7 @@ import threading import pytest import requests +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, @@ -31,7 +32,6 @@ from fixtures.remote_storage import ( RemoteStorageKind, s3_storage, ) -from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, run_pg_bench_small, wait_until from urllib3.util.retry import Retry @@ -89,6 +89,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert timeline_path.exists() # 
retry deletes when compaction or gc is running in pageserver + # TODO: review whether this wait_until is actually necessary, we do an await() internally wait_until( number_of_iterations=3, interval=0.2, @@ -136,12 +137,9 @@ DELETE_FAILPOINTS = [ "timeline-delete-before-index-deleted-at", "timeline-delete-before-schedule", "timeline-delete-before-rm", - "timeline-delete-during-rm", "timeline-delete-after-rm", "timeline-delete-before-index-delete", "timeline-delete-after-index-delete", - "timeline-delete-after-rm-metadata", - "timeline-delete-after-rm-dir", ] @@ -191,7 +189,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) assert_prefix_not_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -206,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( [ f".*{timeline_id}.*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # This happens when we fail before scheduling background operation. # Timeline is left in stopping state and retry tries to stop it again. ".*Ignoring new state, equal to the existing one: Stopping", @@ -215,7 +213,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints( # This happens when timeline remains are cleaned up during loading ".*Timeline dir entry become invalid.*", # In one of the branches we poll for tenant to become active. 
Polls can generate this log message: - f".*Tenant {env.initial_tenant} is not active*", + f".*Tenant {env.initial_tenant} is not active.*", + # an on-demand is cancelled by shutdown + ".*initial size calculation failed: downloading failed, possibly for shutdown", ] ) @@ -275,7 +275,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( # Check remote is empty if remote_storage_kind is RemoteStorageKind.MOCK_S3: assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ".*failpoint: timeline-delete-before-rm", ".*Ignoring new state, equal to the existing one: Stopping", # this happens, because the stuck timeline is visible to shutdown - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", ] ) @@ -449,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild assert all([tl["state"] == "Active" for tl in timelines]) assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -466,7 +466,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -482,7 +482,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild wait_until( 2, 0.5, - lambda: assert_prefix_empty(neon_env_builder), + lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), ) @@ -534,7 +534,7 @@ def test_concurrent_timeline_delete_stuck_on( try: def first_call_hit_failpoint(): - 
assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" ) @@ -605,7 +605,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*" def hit_failpoint(): - assert env.pageserver.log_contains(at_failpoint_log_message) + env.pageserver.assert_log_contains(at_failpoint_log_message) wait_until(50, 0.1, hit_failpoint) @@ -615,7 +615,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.append(hangup_log_message) def got_hangup_log_message(): - assert env.pageserver.log_contains(hangup_log_message) + env.pageserver.assert_log_contains(hangup_log_message) wait_until(50, 0.1, got_hangup_log_message) @@ -627,7 +627,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def first_request_finished(): message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" - assert env.pageserver.log_contains(message) + env.pageserver.assert_log_contains(message) wait_until(50, 0.1, first_request_finished) @@ -651,9 +651,7 @@ def test_timeline_delete_works_for_remote_smoke( timeline_ids = [env.initial_timeline] for i in range(2): branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main") - pg = env.endpoints.create_start(f"new{i}") - - with pg.cursor() as cur: + with env.endpoints.create_start(f"new{i}") as pg, pg.cursor() as cur: cur.execute("CREATE TABLE f (i integer);") cur.execute("INSERT INTO f VALUES (generate_series(1,1000));") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) @@ -673,7 +671,7 @@ def test_timeline_delete_works_for_remote_smoke( for timeline_id in timeline_ids: assert_prefix_not_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -690,7 +688,7 @@ def test_timeline_delete_works_for_remote_smoke( 
timeline_delete_wait_completed(ps_http, tenant_id=tenant_id, timeline_id=timeline_id) assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -703,7 +701,7 @@ def test_timeline_delete_works_for_remote_smoke( # for some reason the check above doesnt immediately take effect for the below. # Assume it is mock server inconsistency and check twice. - wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder)) + wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) def test_delete_orphaned_objects( @@ -764,7 +762,7 @@ def test_delete_orphaned_objects( for orphan in orphans: assert not orphan.exists() - assert env.pageserver.log_contains( + env.pageserver.assert_log_contains( f"deleting a file not referenced from index_part.json name={orphan.stem}" ) @@ -791,7 +789,7 @@ def test_timeline_delete_resumed_on_attach( last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) assert_prefix_not_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -803,7 +801,7 @@ def test_timeline_delete_resumed_on_attach( ) # failpoint before we remove index_part from s3 - failpoint = "timeline-delete-during-rm" + failpoint = "timeline-delete-after-rm" ps_http.configure_failpoints((failpoint, "return")) env.pageserver.allowed_errors.extend( @@ -811,7 +809,7 @@ def test_timeline_delete_resumed_on_attach( # allow errors caused by failpoints f".*failpoint: {failpoint}", # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", # error from http response is also 
logged ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*", # Polling after attach may fail with this @@ -839,7 +837,7 @@ def test_timeline_delete_resumed_on_attach( assert reason.endswith(f"failpoint: {failpoint}"), reason assert_prefix_not_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", @@ -870,7 +868,7 @@ def test_timeline_delete_resumed_on_attach( assert not tenant_path.exists() assert_prefix_empty( - neon_env_builder, + neon_env_builder.pageserver_remote_storage, prefix="/".join( ( "tenants", diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py new file mode 100644 index 0000000000..f0b2f7d733 --- /dev/null +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -0,0 +1,594 @@ +import datetime +import enum +from concurrent.futures import ThreadPoolExecutor +from queue import Empty, Queue +from threading import Barrier +from typing import List, Tuple + +import pytest +from fixtures.common_types import Lsn, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + PgBin, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException +from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404 +from fixtures.remote_storage import LocalFsStorage +from fixtures.utils import assert_pageserver_backups_equal + + +def by_end_lsn(info: HistoricLayerInfo) -> Lsn: + assert info.lsn_end is not None + return Lsn(info.lsn_end) + + +def layer_name(info: HistoricLayerInfo) -> str: + return info.layer_file_name + + +@enum.unique +class Branchpoint(str, enum.Enum): + """ + Have branches at these Lsns possibly relative to L0 layer boundary. 
+ """ + + EARLIER = "earlier" + AT_L0 = "at" + AFTER_L0 = "after" + LAST_RECORD_LSN = "head" + + def __str__(self) -> str: + return self.value + + @staticmethod + def all() -> List["Branchpoint"]: + return [ + Branchpoint.EARLIER, + Branchpoint.AT_L0, + Branchpoint.AFTER_L0, + Branchpoint.LAST_RECORD_LSN, + ] + + +SHUTDOWN_ALLOWED_ERRORS = [ + ".*initial size calculation failed: downloading failed, possibly for shutdown", + ".*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ".*logical_size_calculation_task:panic.*: Sequential get failed with Bad state \\(not active\\).*", + ".*Task 'initial size calculation' .* panicked.*", +] + + +@pytest.mark.parametrize("branchpoint", Branchpoint.all()) +@pytest.mark.parametrize("restart_after", [True, False]) +@pytest.mark.parametrize("write_to_branch_first", [True, False]) +def test_ancestor_detach_branched_from( + test_output_dir, + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, + branchpoint: Branchpoint, + restart_after: bool, + write_to_branch_first: bool, +): + """ + Creates a branch relative to L0 lsn boundary according to Branchpoint. Later the timeline is detached. 
+ """ + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + + after_first_tx = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + # create a single layer for us to remote copy + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + deltas = client.layer_map_info(env.initial_tenant, env.initial_timeline).delta_layers() + # there is also the in-mem layer, but ignore it for now + assert len(deltas) == 2, "expecting there to be two deltas: initdb and checkpointed" + later_delta = max(deltas, key=by_end_lsn) + assert later_delta.lsn_end is not None + + # -1 as the lsn_end is exclusive. 
+ last_lsn = Lsn(later_delta.lsn_end).lsn_int - 1 + + if branchpoint == Branchpoint.EARLIER: + branch_at = after_first_tx + rows = 0 + truncated_layers = 1 + elif branchpoint == Branchpoint.AT_L0: + branch_at = Lsn(last_lsn) + rows = 8192 + truncated_layers = 0 + elif branchpoint == Branchpoint.AFTER_L0: + branch_at = Lsn(last_lsn + 8) + rows = 8192 + # as there is no 8 byte walrecord, nothing should get copied from the straddling layer + truncated_layers = 0 + else: + # this case also covers the implicit flush of ancestor as the inmemory hasn't been flushed yet + assert branchpoint == Branchpoint.LAST_RECORD_LSN + branch_at = None + rows = 16384 + truncated_layers = 0 + + name = "new main" + + timeline_id = env.neon_cli.create_branch( + name, "main", env.initial_tenant, ancestor_start_lsn=branch_at + ) + + recorded = Lsn(client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_lsn"]) + if branch_at is None: + # fix it up if we need it later (currently unused) + branch_at = recorded + else: + assert branch_at == recorded, "the test should not use unaligned lsns" + + if write_to_branch_first: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + # make sure the ep is writable + # with EARLIER, AFTER_L0 there will be a gap in Lsns caused by accurate end_lsn on straddling layers + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, timeline_id) + + # branch must have a flush for "PREV_LSN: none" + client.timeline_checkpoint(env.initial_tenant, timeline_id) + branch_layers = set( + map(layer_name, client.layer_map_info(env.initial_tenant, timeline_id).historic_layers) + ) + else: + branch_layers = set() + + # run fullbackup to make sure there are no off by one errors + # take this on the parent + fullbackup_before = test_output_dir / "fullbackup-before.tar" + pg_bin.take_fullbackup( + env.pageserver, 
env.initial_tenant, env.initial_timeline, branch_at, fullbackup_before + ) + + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert all_reparented == set() + + if restart_after: + env.pageserver.stop() + env.pageserver.start() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == 16384 + + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + old_main_info = client.layer_map_info(env.initial_tenant, env.initial_timeline) + old_main = set(map(layer_name, old_main_info.historic_layers)) + + new_main_info = client.layer_map_info(env.initial_tenant, timeline_id) + new_main = set(map(layer_name, new_main_info.historic_layers)) + + new_main_copied_or_truncated = new_main - branch_layers + new_main_truncated = new_main_copied_or_truncated - old_main + + assert len(new_main_truncated) == truncated_layers + # could additionally check that the symmetric difference has layers starting at the same lsn + # but if nothing was copied, then there is no nice rule. + # there could be a hole in LSNs between copied from the "old main" and the first branch layer. 
+ + # take this on the detached, at same lsn + fullbackup_after = test_output_dir / "fullbackup-after.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, timeline_id, branch_at, fullbackup_after + ) + + client.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + + # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different + # as there is always "PREV_LSN: invalid" for "before" + skip_files = {"zenith.signal"} + + assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files) + + +def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): + """ + The case from RFC: + + +-> another branch with same ancestor_lsn as new main + | + old main -------|---------X---------> + | | | + | | +-> after + | | + | +-> new main + | + +-> reparented + + Ends up as: + + old main ---------------------------> + | + +-> after + + +-> another branch with same ancestor_lsn as new main + | + new main -------|---------|-> + | + +-> reparented + + We confirm the end result by being able to delete "old main" after deleting "after". 
+ """ + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + + branchpoint_pipe = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + # as this only gets reparented, we don't need to write to it like new main + reparented = env.neon_cli.create_branch( + "reparented", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_pipe + ) + + same_branchpoint = env.neon_cli.create_branch( + "same_branchpoint", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + timeline_id = env.neon_cli.create_branch( + "new main", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + ) + + after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) + + all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert all_reparented == {reparented, same_branchpoint} + + env.pageserver.quiesce_tenants() + + # checking the ancestor after is much faster than waiting for the endpoint not start + expected_result = [ + ("main", env.initial_timeline, None, 16384, 1), + ("after", after, env.initial_timeline, 16384, 1), + ("new main", timeline_id, None, 8192, 1), + ("same_branchpoint", same_branchpoint, timeline_id, 8192, 1), + ("reparented", reparented, timeline_id, 0, 1), + ] + + assert 
isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for _, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) + ancestor_timeline_id = details["ancestor_timeline_id"] + if expected_ancestor is None: + assert ancestor_timeline_id is None + else: + assert TimelineId(ancestor_timeline_id) == expected_ancestor + + index_part = env.pageserver_remote_storage.index_content( + env.initial_tenant, queried_timeline + ) + lineage = index_part["lineage"] + assert lineage is not None + + assert lineage.get("reparenting_history_overflown", "false") == "false" + + if queried_timeline == timeline_id: + original_ancestor = lineage["original_ancestor"] + assert original_ancestor is not None + assert original_ancestor[0] == str(env.initial_timeline) + assert original_ancestor[1] == str(branchpoint_x) + + # this does not contain Z in the end, so fromisoformat accepts it + # it is to be in line with the deletion timestamp.. well, almost. 
+ when = original_ancestor[2][:26] + when_ts = datetime.datetime.fromisoformat(when) + assert when_ts < datetime.datetime.now() + assert len(lineage.get("reparenting_history", [])) == 0 + elif expected_ancestor == timeline_id: + assert len(lineage.get("original_ancestor", [])) == 0 + assert lineage["reparenting_history"] == [str(env.initial_timeline)] + else: + assert len(lineage.get("original_ancestor", [])) == 0 + assert len(lineage.get("reparenting_history", [])) == 0 + + for name, _, _, rows, starts in expected_result: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1 + + # delete the timelines to confirm detach actually worked + client.timeline_delete(env.initial_tenant, after) + wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0) + + client.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + + +def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): + """ + Makes sure that the timeline is able to receive writes through-out the detach process. + """ + + env = neon_env_builder.init_start() + + client = env.pageserver.http_client() + + # row counts have been manually verified to cause reconnections and getpage + # requests when restart_after=False with pg16 + def insert_rows(n: int, ep) -> int: + ep.safe_psql( + f"INSERT INTO foo SELECT i::bigint, 'more info!! 
this is a long string' || i FROM generate_series(0, {n - 1}) g(i);" + ) + return n + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE EXTENSION neon_test_utils;") + ep.safe_psql("CREATE TABLE foo (i BIGINT, aux TEXT NOT NULL);") + + rows = insert_rows(256, ep) + + branchpoint = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + timeline_id = env.neon_cli.create_branch( + "new main", "main", tenant_id=env.initial_tenant, ancestor_start_lsn=branchpoint + ) + + log.info("starting the new main endpoint") + ep = env.endpoints.create_start("new main", tenant_id=env.initial_tenant) + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + def small_txs(ep, queue: Queue[str], barrier): + extra_rows = 0 + + with ep.connect() as conn: + while True: + try: + queue.get_nowait() + break + except Empty: + pass + + if barrier is not None: + barrier.wait() + barrier = None + + cursor = conn.cursor() + cursor.execute( + "INSERT INTO foo(i, aux) VALUES (1, 'more info!! 
this is a long string' || 1);" + ) + extra_rows += 1 + return extra_rows + + with ThreadPoolExecutor(max_workers=1) as exec: + queue: Queue[str] = Queue() + barrier = Barrier(2) + + completion = exec.submit(small_txs, ep, queue, barrier) + barrier.wait() + + reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert len(reparented) == 0 + + env.pageserver.quiesce_tenants() + + queue.put("done") + extra_rows = completion.result() + assert extra_rows > 0, "some rows should had been written" + rows += extra_rows + + assert client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_timeline_id"] is None + + assert ep.safe_psql("SELECT clear_buffer_cache();") + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql("SELECT SUM(LENGTH(aux)) FROM foo")[0][0] != 0 + ep.stop() + + # finally restart the endpoint and make sure we still have the same answer + with env.endpoints.create_start("new main", tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + +def test_compaction_induced_by_detaches_in_history( + neon_env_builder: NeonEnvBuilder, test_output_dir, pg_bin: PgBin +): + """ + Assuming the tree of timelines: + + root + |- child1 + |- ... + |- wanted_detached_child + + Each detach can add N more L0 per level, this is actually unbounded because + compaction can be arbitrarily delayed (or detach happen right before one + starts). If "wanted_detached_child" has already made progress and compacted + L1s, we want to make sure "compaction in the history" does not leave the + timeline broken. + """ + + env = neon_env_builder.init_start( + initial_tenant_conf={ + # we want to create layers manually so we don't branch on arbitrary + # Lsn, but we also do not want to compact L0 -> L1. 
+ "compaction_threshold": "99999", + "compaction_period": "0s", + # shouldn't matter, but just in case + "gc_period": "0s", + } + ) + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + client = env.pageserver.http_client() + + def delta_layers(timeline_id: TimelineId): + # shorthand for more readable formatting + return client.layer_map_info(env.initial_tenant, timeline_id).delta_layers() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("create table integers (i bigint not null);") + ep.safe_psql("insert into integers (i) values (42)") + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + assert len(delta_layers(env.initial_timeline)) == 2 + + more_good_numbers = range(0, 3) + + branches: List[Tuple[str, TimelineId]] = [("main", env.initial_timeline)] + + for num in more_good_numbers: + branch_name = f"br-{len(branches)}" + branch_timeline_id = env.neon_cli.create_branch( + branch_name, + ancestor_branch_name=branches[-1][0], + tenant_id=env.initial_tenant, + ancestor_start_lsn=branch_lsn, + ) + branches.append((branch_name, branch_timeline_id)) + + with env.endpoints.create_start(branches[-1][0], tenant_id=env.initial_tenant) as ep: + ep.safe_psql( + f"insert into integers (i) select i from generate_series({num}, {num + 100}) as s(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + + assert len(delta_layers(branch_timeline_id)) == 1 + + # now fill in the final, most growing timeline + + branch_name, branch_timeline_id = branches[-1] + with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: + ep.safe_psql("insert into integers (i) select i from generate_series(50, 500) s(i)") + + last_suffix = None + for suffix in range(0, 4): + ep.safe_psql(f"create table 
other_table_{suffix} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + last_suffix = suffix + + assert last_suffix is not None + + assert len(delta_layers(branch_timeline_id)) == 5 + + client.patch_tenant_config_client_side( + env.initial_tenant, {"compaction_threshold": 5}, None + ) + + client.timeline_compact(env.initial_tenant, branch_timeline_id) + + # one more layer + ep.safe_psql(f"create table other_table_{last_suffix + 1} as select * from integers") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, branch_timeline_id) + + # we need to wait here, because the detaches will do implicit tenant restart, + # and we could get unexpected layer counts + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id, wait_until_uploaded=True) + + assert len(list(filter(lambda x: x.l0, delta_layers(branch_timeline_id)))) == 1 + + skip_main = branches[1:] + branch_lsn = client.timeline_detail(env.initial_tenant, branch_timeline_id)["ancestor_lsn"] + + # take the fullbackup before and after inheriting the new L0s + fullbackup_before = test_output_dir / "fullbackup-before.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_before + ) + + for _, timeline_id in skip_main: + reparented = client.detach_ancestor(env.initial_tenant, timeline_id) + assert reparented == set(), "we have no earlier branches at any level" + + post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) + assert len(post_detach_l0s) == 5, "should had inherited 4 L0s, have 5 in total" + + # checkpoint does compaction, which in turn decides to run, because + # there is now in total threshold number L0s even if they are not + # adjacent in Lsn space: + # + # inherited flushed during this checkpoint + # \\\\ / + # 1234X5---> lsn + # | + # l1 layers from "fill in the final, most growing timeline" + #
+ # branch_lsn is between 4 and first X. + client.timeline_checkpoint(env.initial_tenant, branch_timeline_id) + + post_compact_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id))) + assert len(post_compact_l0s) == 1, "only the consecutive inherited L0s should be compacted" + + fullbackup_after = test_output_dir / "fullbackup_after.tar" + pg_bin.take_fullbackup( + env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after + ) + + # we don't need to skip any files, because zenith.signal will be identical + assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) + + +def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with pytest.raises(PageserverApiException, match=".* no ancestors") as info: + client.detach_ancestor(env.initial_tenant, env.initial_timeline) + assert info.value.status_code == 409 + + first_branch = env.neon_cli.create_branch("first_branch") + second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + + # funnily enough this does not have a prefix + with pytest.raises(PageserverApiException, match="too many ancestors") as info: + client.detach_ancestor(env.initial_tenant, second_branch) + assert info.value.status_code == 400 + + client.tenant_delete(env.initial_tenant) + wait_tenant_status_404(client, env.initial_tenant, 10, 1) + + with pytest.raises(PageserverApiException) as e: + client.detach_ancestor(env.initial_tenant, first_branch) + assert e.value.status_code == 404 + + +# TODO: +# - after starting the operation, tenant is deleted +# - after starting the operation, pageserver is shutdown, restarted +# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited +# - deletion of reparented while reparenting should fail once, then succeed 
(?) +# - branch near existing L1 boundary, image layers? +# - investigate: why are layers started at uneven lsn? not just after branching, but in general. diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 11685d1d48..db5297870e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -1,9 +1,8 @@ import concurrent.futures import math -import queue import random -import threading import time +from collections import defaultdict from contextlib import closing from pathlib import Path from typing import Optional @@ -11,16 +10,18 @@ from typing import Optional import psycopg2.errors import psycopg2.extras import pytest +from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, NeonEnv, NeonEnvBuilder, + NeonPageserver, PgBin, VanillaPostgres, wait_for_last_flush_lsn, ) -from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( assert_tenant_state, timeline_delete_wait_completed, @@ -31,7 +32,6 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind -from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size, wait_until @@ -40,10 +40,9 @@ def test_timeline_size(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty") client = env.pageserver.http_client() - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = env.endpoints.create_start("test_timeline_size") - log.info("postgres is running on 'test_timeline_size' branch") with 
closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: @@ -73,13 +72,12 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty") client = env.pageserver.http_client() - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) timeline_details = client.timeline_detail( env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True ) endpoint_main = env.endpoints.create_start("test_timeline_size_createdropdb") - log.info("postgres is running on 'test_timeline_size_createdropdb' branch") with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: @@ -153,7 +151,7 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): client = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup") - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = env.endpoints.create( "test_timeline_size_quota_on_startup", @@ -162,8 +160,6 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): ) endpoint_main.start() - log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE foo (t text)") @@ -219,7 +215,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): client = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota") - wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id) + client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) endpoint_main = env.endpoints.create( "test_timeline_size_quota", 
@@ -231,8 +227,6 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): endpoint_main.respec(skip_pg_catalog_updates=False) endpoint_main.start() - log.info("postgres is running on 'test_timeline_size_quota' branch") - with closing(endpoint_main.connect()) as conn: with conn.cursor() as cur: cur.execute("CREATE TABLE foo (t text)") @@ -337,41 +331,18 @@ def test_timeline_initial_logical_size_calculation_cancellation( assert_size_calculation_not_done() log.info( - f"try to delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" + f"delete the timeline using {deletion_method}, this should cancel size computation tasks and wait for them to finish" ) - delete_timeline_success: queue.Queue[bool] = queue.Queue(maxsize=1) - def delete_timeline_thread_fn(): - try: - if deletion_method == "tenant_detach": - client.tenant_detach(tenant_id) - elif deletion_method == "timeline_delete": - timeline_delete_wait_completed(client, tenant_id, timeline_id) - delete_timeline_success.put(True) - except PageserverApiException: - delete_timeline_success.put(False) - raise + if deletion_method == "tenant_detach": + client.tenant_detach(tenant_id) + elif deletion_method == "timeline_delete": + timeline_delete_wait_completed(client, tenant_id, timeline_id) + else: + raise RuntimeError(deletion_method) - delete_timeline_thread = threading.Thread(target=delete_timeline_thread_fn) - delete_timeline_thread.start() - # give it some time to settle in the state where it waits for size computation task - time.sleep(5) - if not delete_timeline_success.empty(): - raise AssertionError( - f"test is broken, the {deletion_method} should be stuck waiting for size computation task, got result {delete_timeline_success.get()}" - ) - - log.info( - "resume the size calculation. The failpoint checks that the timeline directory still exists." 
- ) - client.configure_failpoints(("timeline-calculate-logical-size-check-dir-exists", "return")) - client.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) - - log.info("wait for delete timeline thread to finish and assert that it succeeded") - assert delete_timeline_success.get() - - # if the implementation is incorrect, the teardown would complain about an error log - # message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists" + # timeline-calculate-logical-size-pause is still paused, but it doesn't + # matter because it's a pausable_failpoint, which can be cancelled by drop. def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder): @@ -444,11 +415,12 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder # Disable background compaction as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = ( - "tenant_config={checkpoint_distance=100000, compaction_period='10m'}" + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": "100000", + "compaction_period": "10m", + } ) - - env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") @@ -491,9 +463,14 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): # Disable background compaction and GC as we don't want it to happen after `get_physical_size` request # and before checking the expected size on disk, which makes the assertion failed - neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}" - - env = neon_env_builder.init_start() + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": 
"100000", + "compaction_period": "0s", + "gc_period": "0s", + "pitr_interval": "1s", + } + ) pageserver_http = env.pageserver.http_client() new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") @@ -585,7 +562,6 @@ def test_timeline_size_metrics( pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version) port = port_distributor.get_port() with VanillaPostgres(pgdatadir, pg_bin, port) as vanilla_pg: - vanilla_pg.configure([f"port={port}"]) vanilla_pg.start() # Create database based on template0 because we can't connect to template0 @@ -680,7 +656,7 @@ def get_physical_size_values( client = env.pageserver.http_client() res.layer_map_file_size_sum = sum( - layer.layer_file_size or 0 + layer.layer_file_size for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers ) @@ -715,26 +691,11 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues): # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS -# Timeline logical size initialization is an asynchronous background task that runs once, -# try a few times to ensure it's activated properly -def wait_for_timeline_size_init( - client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId -): - for i in range(10): - timeline_details = client.timeline_detail( - tenant, timeline, include_non_incremental_logical_size=True - ) - current_logical_size = timeline_details["current_logical_size"] - non_incremental = timeline_details["current_logical_size_non_incremental"] - if current_logical_size == non_incremental: - return - log.info( - f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}" - ) - time.sleep(1) - raise Exception( - f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}" - ) +def wait_for_tenant_startup_completions(client: 
PageserverHttpClient, count: int): + def condition(): + assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count + + wait_until(5, 1.0, condition) def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): @@ -759,15 +720,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): tenant_ids = {env.initial_tenant} for _i in range(0, n_tenants - 1): tenant_id = TenantId.generate() - env.pageserver.tenant_create(tenant_id) - - # Empty tenants are not subject to waiting for logical size calculations, because - # those hapen on timeline level - timeline_id = TimelineId.generate() - env.neon_cli.create_timeline( - new_branch_name="main", tenant_id=tenant_id, timeline_id=timeline_id - ) - + env.neon_cli.create_tenant(tenant_id) tenant_ids.add(tenant_id) # Restart pageserver with logical size calculations paused @@ -828,10 +781,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # That one that we successfully accessed is now Active expect_activated += 1 assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active" - assert ( - pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") - == expect_activated - 1 - ) + wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1) # The ones we didn't touch are still in Attaching assert ( @@ -851,10 +801,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): == n_tenants - expect_activated ) - assert ( - pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") - == expect_activated - 1 - ) + wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1) # When we unblock logical size calculation, all tenants should proceed to active state via # the warmup route. 
@@ -874,9 +821,9 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants ) - assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants + wait_for_tenant_startup_completions(pageserver_http, count=n_tenants) - # Check that tenant deletion proactively wakes tenants: this is done separately to the main + # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main # body of the test because it will disrupt tenant counts env.pageserver.stop() env.pageserver.start( @@ -884,27 +831,58 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): ) wait_until(10, 1, at_least_one_active) - delete_tenant_id = list( + + detach_tenant_id = list( [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] )[0][0] + delete_tenant_id = list( + [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] + )[1][0] + + # Detaching a stuck tenant should proceed promptly + # (reproducer for https://github.com/neondatabase/neon/pull/6430) + env.pageserver.http_client().tenant_detach(detach_tenant_id, timeout_secs=10) + tenant_ids.remove(detach_tenant_id) + # FIXME: currently the mechanism for cancelling attach is to set state to broken, which is reported spuriously at error level + env.pageserver.allowed_errors.append( + ".*attach failed, setting tenant state to Broken: Shut down while Attaching" + ) # Deleting a stuck tenant should prompt it to go active + # in some cases, it has already been activated because it's behind the detach + delete_lazy_activating(delete_tenant_id, env.pageserver, expect_attaching=False) + tenant_ids.remove(delete_tenant_id) + + # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one + # we detached) + wait_until(10, 1, all_active) + assert len(get_tenant_states()) == n_tenants - 2 + + +def 
delete_lazy_activating( + delete_tenant_id: TenantId, pageserver: NeonPageserver, expect_attaching: bool +): + pageserver_http = pageserver.http_client() + + # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating + # logical size is paused in a failpoint. So instead we will use a log observation to check that + # on-demand activation was triggered by the tenant deletion + log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*" + + if expect_attaching: + assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching" + with concurrent.futures.ThreadPoolExecutor() as executor: log.info("Starting background delete") + def activated_on_demand(): + assert pageserver.log_contains(log_match) is not None + def delete_tenant(): - env.pageserver.http_client().tenant_delete(delete_tenant_id) + pageserver_http.tenant_delete(delete_tenant_id) background_delete = executor.submit(delete_tenant) - # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating - # logical size is paused in a failpoint. 
So instead we will use a log observation to check that - # on-demand activation was triggered by the tenant deletion - log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*" - - def activated_on_demand(): - assert env.pageserver.log_contains(log_match) is not None - log.info(f"Waiting for activation message '{log_match}'") try: wait_until(10, 1, activated_on_demand) @@ -918,8 +896,224 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Poll for deletion to complete wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40) - tenant_ids.remove(delete_tenant_id) - # Check that all the stuck tenants proceed to active (apart from the one that deletes) - wait_until(10, 1, all_active) - assert len(get_tenant_states()) == n_tenants - 1 + +def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): + """ + /v1/tenant/:tenant_shard_id/timeline and /v1/tenant/:tenant_shard_id + should not bump the priority of the initial logical size computation + background task, unless the force-await-initial-logical-size query param + is set to true. + + This test verifies the invariant stated above. A couple of tricks are involved: + 1. Detach the tenant and re-attach it after the page server is restarted. This circumvents + the warm-up which forces the initial logical size calculation. + 2. A fail point (initial-size-calculation-permit-pause) is used to block the initial + computation of the logical size until forced. + 3. A fail point (walreceiver-after-ingest) is used to pause the walreceiver since + otherwise it would force the logical size computation. 
+ """ + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # load in some data + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + endpoint.safe_psql_many( + [ + "CREATE TABLE foo (x INTEGER)", + "INSERT INTO foo SELECT g FROM generate_series(1, 10000) g", + ] + ) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + # restart with failpoint inside initial size calculation task + log.info(f"Detaching tenant {tenant_id} and stopping pageserver...") + + endpoint.stop() + env.pageserver.tenant_detach(tenant_id) + env.pageserver.stop() + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "initial-size-calculation-permit-pause=pause;walreceiver-after-ingest=pause" + } + ) + + log.info(f"Re-attaching tenant {tenant_id}...") + env.pageserver.tenant_attach(tenant_id) + + # kick off initial size calculation task (the response we get here is the estimated size) + def assert_initial_logical_size_not_prioritised(): + details = client.timeline_detail(tenant_id, timeline_id) + assert details["current_logical_size_is_accurate"] is False + + assert_initial_logical_size_not_prioritised() + + # ensure that's actually the case + time.sleep(2) + assert_initial_logical_size_not_prioritised() + + details = client.timeline_detail(tenant_id, timeline_id, force_await_initial_logical_size=True) + assert details["current_logical_size_is_accurate"] is True + + client.configure_failpoints( + [("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")] + ) + + +def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # the supporting_second does nothing except queue behind env.initial_tenant + # for purposes of showing that eager_tenant breezes past the queue + supporting_second, _ = 
env.neon_cli.create_tenant() + eager_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + eager_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + tenant_ids = [env.initial_tenant, supporting_second] + + def get_tenant_states() -> dict[str, list[TenantId]]: + states = defaultdict(list) + for id in tenant_ids: + state = client.tenant_status(id)["state"]["slug"] + states[state].append(id) + return dict(states) + + def one_is_active(): + states = get_tenant_states() + log.info(f"{states}") + assert len(states["Active"]) == 1 + + wait_until(10, 1, one_is_active) + + def other_is_attaching(): + states = get_tenant_states() + assert len(states["Attaching"]) == 1 + + wait_until(10, 1, other_is_attaching) + + def eager_tenant_is_active(): + resp = client.tenant_status(eager_tenant) + assert resp["state"]["slug"] == "Active" + + gen = env.storage_controller.attach_hook_issue(eager_tenant, env.pageserver.id) + client.tenant_location_conf( + eager_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=False, + ) + wait_until(10, 1, eager_tenant_is_active) + + other_is_attaching() + + client.configure_failpoints( + [("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")] + ) + + +@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) +def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): + # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint 
activated after restart + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + + env = neon_env_builder.init_start() + + # because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue + lazy_tenant, _ = env.neon_cli.create_tenant() + + client = env.pageserver.http_client() + client.tenant_location_conf( + lazy_tenant, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.pageserver.stop() + + # pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation + env.pageserver.start( + extra_env_vars={ + "FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause" + } + ) + + def initial_tenant_is_active(): + resp = client.tenant_status(env.initial_tenant) + assert resp["state"]["slug"] == "Active" + + wait_until(10, 1, initial_tenant_is_active) + + # even though the initial tenant is now active, because it was startup time + # attach, it will consume the only permit because logical size calculation + # is paused. 
+ + gen = env.storage_controller.attach_hook_issue(lazy_tenant, env.pageserver.id) + client.tenant_location_conf( + lazy_tenant, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": gen, + }, + lazy=True, + ) + + def lazy_tenant_is_attaching(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Attaching" + + # paused logical size calculation of env.initial_tenant is keeping it attaching + wait_until(10, 1, lazy_tenant_is_attaching) + + for _ in range(5): + lazy_tenant_is_attaching() + time.sleep(0.5) + + def lazy_tenant_is_active(): + resp = client.tenant_status(lazy_tenant) + assert resp["state"]["slug"] == "Active" + + if activation_method == "endpoint": + with env.endpoints.create_start("main", tenant_id=lazy_tenant): + # starting up the endpoint should make it jump the queue + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "branch": + env.neon_cli.create_timeline("second_branch", lazy_tenant) + wait_until(10, 1, lazy_tenant_is_active) + elif activation_method == "delete": + delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) + else: + raise RuntimeError(activation_method) diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index 305271c715..dd76689008 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -13,7 +13,6 @@ def test_twophase(neon_simple_env: NeonEnv): endpoint = env.endpoints.create_start( "test_twophase", config_lines=["max_prepared_transactions=5"] ) - log.info("postgres is running on 'test_twophase' branch") conn = endpoint.connect() cur = conn.cursor() diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py index 708bf0dfeb..137d28b9fa 100644 --- a/test_runner/regress/test_unlogged.py +++ b/test_runner/regress/test_unlogged.py @@ -1,4 +1,5 @@ from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from 
fixtures.pg_version import PgVersion # @@ -17,7 +18,8 @@ def test_unlogged(neon_simple_env: NeonEnv): cur.execute("CREATE UNLOGGED TABLE iut (id int);") # create index to test unlogged index relation as well cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);") - cur.execute("INSERT INTO iut values (42);") + cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;") + cur.execute("INSERT INTO iut (id) values (42);") # create another compute to fetch inital empty contents from pageserver fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged") @@ -26,7 +28,15 @@ def test_unlogged(neon_simple_env: NeonEnv): conn2 = endpoint2.connect() cur2 = conn2.cursor() # after restart table should be empty but valid - cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)") + cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut (id) VALUES ($1)") cur2.execute("EXECUTE iut_plan (43);") cur2.execute("SELECT * FROM iut") - assert cur2.fetchall() == [(43,)] + results = cur2.fetchall() + # Unlogged sequences were introduced in v15. On <= v14, the sequence created + # for the GENERATED ALWAYS AS IDENTITY column is logged, and hence it keeps + # the old value (2) on restart. While on v15 and above, it's unlogged, so it + # gets reset to 1. 
+ if env.pg_version <= PgVersion.V14: + assert results == [(43, 2)] + else: + assert results == [(43, 1)] diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index bc810ceb09..225b952e73 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -1,5 +1,9 @@ +import time +from contextlib import closing + from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn +from fixtures.utils import query_scalar # @@ -12,7 +16,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_vm_bit_clear", "empty") endpoint = env.endpoints.create_start("test_vm_bit_clear") - log.info("postgres is running on 'test_vm_bit_clear' branch") pg_conn = endpoint.connect() cur = pg_conn.cursor() @@ -91,7 +94,6 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # server at the right point-in-time avoids that full-page image. endpoint_new = env.endpoints.create_start("test_vm_bit_clear_new") - log.info("postgres is running on 'test_vm_bit_clear_new' branch") pg_new_conn = endpoint_new.connect() cur_new = pg_new_conn.cursor() @@ -113,16 +115,103 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): assert cur_new.fetchall() == [] -# -# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK -# record. -# -def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): - env = neon_simple_env +def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder): + """ + Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK record. - env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock", "empty") + This is a repro for the bug fixed in commit 66fa176cc8. 
+ """ + env = neon_env_builder.init_start() endpoint = env.endpoints.create_start( - "test_vm_bit_clear_on_heap_lock", + "main", + config_lines=[ + # If auto-analyze runs at the same time that we run VACUUM FREEZE, it + # can hold a snapshot that prevents the tuples from being frozen. + "autovacuum=off", + "log_checkpoints=on", + ], + ) + + # Run the tests in a dedicated database, because the activity monitor + # periodically runs some queries on the 'postgres' database. If that + # happens at the same time that we're trying to freeze, the activity + # monitor's queries can hold back the xmin horizon and prevent freezing. + with closing(endpoint.connect()) as pg_conn: + pg_conn.cursor().execute("CREATE DATABASE vmbitsdb") + pg_conn = endpoint.connect(dbname="vmbitsdb") + cur = pg_conn.cursor() + + # Install extension containing function needed for test + cur.execute("CREATE EXTENSION neon_test_utils") + cur.execute("CREATE EXTENSION pageinspect") + + # Create a test table and freeze it to set the all-frozen VM bit on all pages. + cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)") + cur.execute("BEGIN") + cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g") + xid = int(query_scalar(cur, "SELECT txid_current()")) + cur.execute("COMMIT") + cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true, VERBOSE) vmtest_lock") + for notice in pg_conn.notices: + log.info(f"{notice}") + + # This test has been flaky in the past, because background activity like + # auto-analyze and compute_ctl's activity monitor queries have prevented the + # tuples from being frozen. Check that they were frozen. + relfrozenxid = int( + query_scalar(cur, "SELECT relfrozenxid FROM pg_class WHERE relname='vmtest_lock'") + ) + assert ( + relfrozenxid > xid + ), f"Inserted rows were not frozen. This can be caused by concurrent activity in the database. (XID {xid}, relfrozenxid {relfrozenxid}" + + # Lock a row.
This clears the all-frozen VM bit for that page. + cur.execute("BEGIN") + cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE") + cur.execute("COMMIT") + + # The VM page in shared buffer cache, and the same page as reconstructed by + # the pageserver, should be equal. Except for the LSN: Clearing a bit in the + # VM doesn't bump the LSN in PostgreSQL, but the pageserver updates the LSN + # when it replays the VM-bit clearing record (since commit 387a36874c) + # + # This is a bit fragile, we've had a lot of flakiness in this test before. For + # example, because all the VM bits were not set because concurrent + # autoanalyze prevented the VACUUM FREEZE from freezing the tuples. Or + # because autovacuum kicked in and re-froze the page between the + # get_raw_page() and get_raw_page_at_lsn() calls. We disable autovacuum now, + # which should make this deterministic. + cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") + vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() + cur.execute( + "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" + ) + vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() + + assert vm_page_at_pageserver == vm_page_in_cache + + +def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): + """ + The previous test is enough to verify the bug that was fixed in + commit 66fa176cc8. But for good measure, we also reproduce the + original problem that the missing VM page update caused.
+ """ + tenant_conf = { + "checkpoint_distance": f"{128 * 1024}", + "compaction_target_size": f"{128 * 1024}", + "compaction_threshold": "1", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + # set PITR interval to be small, so we can do GC + "pitr_interval": "0 s", + } + env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + endpoint = env.endpoints.create_start( + "main", config_lines=[ "log_autovacuum_min_duration = 0", # Perform anti-wraparound vacuuming aggressively @@ -137,71 +226,61 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): # Install extension containing function needed for test cur.execute("CREATE EXTENSION neon_test_utils") - cur.execute("SELECT pg_switch_wal()") - # Create a test table and freeze it to set the all-frozen VM bit on all pages. cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)") cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g") - cur.execute("VACUUM FREEZE vmtest_lock") + cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock") # Lock a row. This clears the all-frozen VM bit for that page. + cur.execute("BEGIN") cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE") # Remember the XID. We will use it later to verify that we have consumed a lot of # XIDs after this. cur.execute("select pg_current_xact_id()") - locking_xid = cur.fetchall()[0][0] + locking_xid = int(cur.fetchall()[0][0]) - # Stop and restart postgres, to clear the buffer cache. + cur.execute("COMMIT") + + # Kill and restart postgres, to clear the buffer cache. # # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages # in a "clean" way. Our neon extension will write a full-page image of the VM - # page, and we want to avoid that. - endpoint.stop() + # page, and we want to avoid that. 
A clean shutdown will also not do, for the + # same reason. + endpoint.stop(mode="immediate") + endpoint.start() pg_conn = endpoint.connect() cur = pg_conn.cursor() - cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ") - tup = cur.fetchall() - xmax_before = tup[0][1] - # Consume a lot of XIDs, so that anti-wraparound autovacuum kicks # in and the clog gets truncated. We set autovacuum_freeze_max_age to a very # low value, so it doesn't take all that many XIDs for autovacuum to kick in. - for i in range(1000): - cur.execute( - """ - CREATE TEMP TABLE othertable (i int) ON COMMIT DROP; - do $$ - begin - for i in 1..100000 loop - -- Use a begin-exception block to generate a new subtransaction on each iteration - begin - insert into othertable values (i); - exception when others then - raise 'not expected %', sqlerrm; - end; - end loop; - end; - $$; - """ - ) - cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ") - tup = cur.fetchall() - log.info(f"tuple = {tup}") - xmax = tup[0][1] - assert xmax == xmax_before - - if i % 50 == 0: - cur.execute("select datfrozenxid from pg_database where datname='postgres'") - datfrozenxid = cur.fetchall()[0][0] - if datfrozenxid > locking_xid: - break + # + # We could use test_consume_xids() to consume XIDs much faster, + # but it wouldn't speed up the overall test, because we'd still + # need to wait for autovacuum to run. 
+ for _ in range(1000): + cur.execute("select test_consume_xids(10000);") + for _ in range(1000): + cur.execute("select min(datfrozenxid::text::int) from pg_database") + datfrozenxid = int(cur.fetchall()[0][0]) + log.info(f"datfrozenxid {datfrozenxid} locking_xid: {locking_xid}") + if datfrozenxid > locking_xid + 3000000: + break + time.sleep(0.5) cur.execute("select pg_current_xact_id()") - curr_xid = cur.fetchall()[0][0] - assert int(curr_xid) - int(locking_xid) >= 100000 + curr_xid = int(cur.fetchall()[0][0]) + assert curr_xid - locking_xid >= 100000 + + # Perform GC in the pageserver. Otherwise the compute might still + # be able to download the already-deleted SLRU segment from the + # pageserver. That masks the original bug. + env.pageserver.http_client().timeline_checkpoint(tenant_id, timeline_id) + env.pageserver.http_client().timeline_compact(tenant_id, timeline_id) + env.pageserver.http_client().timeline_gc(tenant_id, timeline_id, 0) # Now, if the VM all-frozen bit was not correctly cleared on # replay, we will try to fetch the status of the XID that was @@ -211,3 +290,4 @@ def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv): cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update") tup = cur.fetchall() log.info(f"tuple = {tup}") + cur.execute("commit transaction") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index b4ce633531..7bf208db54 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -17,31 +17,38 @@ import psycopg2 import psycopg2.errors import psycopg2.extras import pytest +import requests from fixtures.broker import NeonBroker +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( Endpoint, - NeonEnv, NeonEnvBuilder, NeonPageserver, PgBin, PgProtocol, Safekeeper, - SafekeeperHttpClient, 
SafekeeperPort, last_flush_lsn_upload, ) from fixtures.pageserver.utils import ( + assert_prefix_empty, + assert_prefix_not_empty, timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, ) from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import RemoteStorageKind, default_remote_storage -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import get_dir_size, query_scalar, start_in_background +from fixtures.remote_storage import ( + RemoteStorageKind, + default_remote_storage, + s3_storage, +) +from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.utils import are_walreceivers_absent +from fixtures.utils import PropagatingThread, get_dir_size, query_scalar, start_in_background def wait_lsn_force_checkpoint( @@ -96,9 +103,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): n_timelines = 3 - branch_names = [ - "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) - ] + branch_names = [f"test_safekeepers_many_timelines_{tlin}" for tlin in range(n_timelines)] # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') # that's not really human readable, so the branch names are introduced in Neon CLI. 
# Neon CLI stores its branch <-> timeline mapping in its internals, @@ -118,7 +123,8 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): with env.pageserver.http_client() as pageserver_http: timeline_details = [ pageserver_http.timeline_detail( - tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name] + tenant_id=tenant_id, + timeline_id=branch_names_to_timeline_ids[branch_name], ) for branch_name in branch_names ] @@ -273,11 +279,6 @@ def test_broker(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_broker", "main") - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - endpoint = env.endpoints.create_start("test_broker") endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") @@ -316,9 +317,9 @@ def test_broker(neon_env_builder: NeonEnvBuilder): time.sleep(1) # Ensure that safekeepers don't lose remote_consistent_lsn on restart. - # Control file is persisted each 5s. TODO: do that on shutdown and remove sleep. - time.sleep(6) for sk in env.safekeepers: + # force persist cfile + sk.http_client().checkpoint(tenant_id, timeline_id) sk.stop() sk.start() stat_after_restart = [cli.timeline_status(tenant_id, timeline_id) for cli in clients] @@ -335,11 +336,6 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): neon_env_builder.auth_enabled = auth_enabled env = neon_env_builder.init_start() - # FIXME: Is this expected? 
- env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" - ) - tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_removal") endpoint = env.endpoints.create_start("test_safekeepers_wal_removal") @@ -364,7 +360,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # We will wait for first segment removal. Make sure they exist for starter. first_segments = [ - os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id), "000000010000000000000001") + sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001" for sk in env.safekeepers ] assert all(os.path.exists(p) for p in first_segments) @@ -378,7 +374,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): http_cli_other = env.safekeepers[0].http_client( auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) - http_cli_noauth = env.safekeepers[0].http_client() + http_cli_noauth = env.safekeepers[0].http_client(gen_sk_wide_token=False) # Pretend WAL is offloaded to s3. 
if auth_enabled: @@ -449,7 +445,7 @@ def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: Tim def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb): http_cli = sk.http_client() tli_status = http_cli.timeline_status(tenant_id, timeline_id) - sk_wal_size = get_dir_size(os.path.join(sk.data_dir(), str(tenant_id), str(timeline_id))) + sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id)) sk_wal_size_mb = sk_wal_size / 1024 / 1024 log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}") return sk_wal_size_mb <= target_size_mb @@ -457,10 +453,19 @@ def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, def test_wal_backup(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 - neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + remote_storage_kind = s3_storage() + neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind) env = neon_env_builder.init_start() + # These are expected after timeline deletion on safekeepers. 
+ env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found in global map.*", + ".*Timeline .* was cancelled and cannot be used anymore.*", + ] + ) + tenant_id = env.initial_tenant timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_backup") endpoint = env.endpoints.create_start("test_safekeepers_wal_backup") @@ -488,7 +493,8 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): # put one of safekeepers down again env.safekeepers[0].stop() # restart postgres - endpoint.stop_and_destroy().create_start("test_safekeepers_wal_backup") + endpoint.stop() + endpoint = env.endpoints.create_start("test_safekeepers_wal_backup") # and ensure offloading still works with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -498,6 +504,17 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): partial(is_segment_offloaded, env.safekeepers[1], tenant_id, timeline_id, seg_end), f"segment ending at {seg_end} get offloaded", ) + env.safekeepers[0].start() + endpoint.stop() + + # Test that after timeline deletion remote objects are gone. 
+ prefix = "/".join([str(tenant_id), str(timeline_id)]) + assert_prefix_not_empty(neon_env_builder.safekeepers_remote_storage, prefix) + + for sk in env.safekeepers: + sk_http = sk.http_client() + sk_http.timeline_delete(tenant_id, timeline_id) + assert_prefix_empty(neon_env_builder.safekeepers_remote_storage, prefix) def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): @@ -574,10 +591,10 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): # save the last (partial) file to put it back after recreation; others will be fetched from s3 sk = env.safekeepers[0] - tli_dir = Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) + tli_dir = Path(sk.data_dir) / str(tenant_id) / str(timeline_id) f_partial = Path([f for f in os.listdir(tli_dir) if f.endswith(".partial")][0]) f_partial_path = tli_dir / f_partial - f_partial_saved = Path(sk.data_dir()) / f_partial.name + f_partial_saved = Path(sk.data_dir) / f_partial.name f_partial_path.rename(f_partial_saved) pg_version = sk.http_client().timeline_status(tenant_id, timeline_id).pg_version @@ -586,7 +603,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): # advancing peer_horizon_lsn. 
for sk in env.safekeepers: cli = sk.http_client() - cli.timeline_delete_force(tenant_id, timeline_id) + cli.timeline_delete(tenant_id, timeline_id, only_local=True) # restart safekeeper to clear its in-memory state sk.stop() # wait all potenital in flight pushes to broker arrive before starting @@ -599,7 +616,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): cli = sk.http_client() cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn) f_partial_path = ( - Path(sk.data_dir()) / str(tenant_id) / str(timeline_id) / f_partial_saved.name + Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name ) shutil.copy(f_partial_saved, f_partial_path) @@ -813,7 +830,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): auth_token=env.auth_keys.generate_tenant_token(TenantId.generate()) ) wa_http_cli_bad.check_status() - wa_http_cli_noauth = wa.http_client() + wa_http_cli_noauth = wa.http_client(gen_sk_wide_token=False) wa_http_cli_noauth.check_status() # debug endpoint requires safekeeper scope @@ -825,7 +842,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # fetch something sensible from status tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch = tli_status.acceptor_epoch + term = tli_status.term timeline_start_lsn = tli_status.timeline_start_lsn if auth_enabled: @@ -846,8 +863,8 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): endpoint.safe_psql("insert into t values(10)") tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id) - epoch_after_reboot = tli_status.acceptor_epoch - assert epoch_after_reboot > epoch + term_after_reboot = tli_status.term + assert term_after_reboot > term # and timeline_start_lsn stays the same assert tli_status.timeline_start_lsn == timeline_start_lsn @@ -947,7 +964,7 @@ def test_sk_auth(neon_env_builder: NeonEnvBuilder): # By default, neon_local enables auth on all services if 
auth is configured, # so http must require the token. - sk_http_cli_noauth = sk.http_client() + sk_http_cli_noauth = sk.http_client(gen_sk_wide_token=False) sk_http_cli_auth = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) with pytest.raises(sk_http_cli_noauth.HTTPError, match="Forbidden|Unauthorized"): sk_http_cli_noauth.timeline_status(tenant_id, timeline_id) @@ -1079,12 +1096,6 @@ def is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): return all([flush_lsns[0] == flsn for flsn in flush_lsns]) -def are_walreceivers_absent(sk_http_cli, tenant_id: TenantId, timeline_id: TimelineId): - status = sk_http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") - return len(status.walreceivers) == 0 - - # Assert by xxd that WAL on given safekeepers is identical. No compute must be # running for this to be reliable. def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): @@ -1094,11 +1105,11 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline # First check that term / flush_lsn are the same: it is easier to # report/understand if WALs are different due to that. statuses = [sk_http_cli.timeline_status(tenant_id, timeline_id) for sk_http_cli in sk_http_clis] - term_flush_lsns = [(s.acceptor_epoch, s.flush_lsn) for s in statuses] + term_flush_lsns = [(s.last_log_term, s.flush_lsn) for s in statuses] for tfl, sk in zip(term_flush_lsns[1:], sks[1:]): assert ( term_flush_lsns[0] == tfl - ), f"(term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" + ), f"(last_log_term, flush_lsn) are not equal on sks {sks[0].id} and {sk.id}: {term_flush_lsns[0]} != {tfl}" # check that WALs are identic. 
segs = [sk.list_segments(tenant_id, timeline_id) for sk in sks] @@ -1121,15 +1132,15 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline ) for f in mismatch: - f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) - f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) + f1 = sk0.timeline_dir(tenant_id, timeline_id) / f + f2 = sk.timeline_dir(tenant_id, timeline_id) / f + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, not_regular) == ( @@ -1329,6 +1340,36 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder): endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'") +# Test that when compute is terminated in fast (or smart) mode, walproposer is +# allowed to run and self terminate after shutdown checkpoint is written, so it +# commits it to safekeepers before exiting. This is not required for correctness, +# but is needed for tests using check_restored_datadir_content.
+def test_wp_graceful_shutdown(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.neon_cli.create_branch("test_wp_graceful_shutdown") + ep = env.endpoints.create_start("test_wp_graceful_shutdown") + ep.safe_psql("create table t(key int, value text)") + ep.stop() + + # figure out checkpoint lsn + ckpt_lsn = pg_bin.get_pg_controldata_checkpoint_lsn(ep.pg_data_dir_path()) + + sk_http_cli = env.safekeepers[0].http_client() + commit_lsn = sk_http_cli.timeline_status(tenant_id, timeline_id).commit_lsn + # Note: this is in memory value. Graceful shutdown of walproposer currently + # doesn't guarantee persisted value, which is ok as we need it only for + # tests. Persisting it without risking too many cf flushes needs a wp -> sk + # protocol change. (though in reality shutdown sync-safekeepers does flush + # of cf, so most of the time persisted value wouldn't lag) + log.info(f"sk commit_lsn {commit_lsn}") + # note that ckpt_lsn is the *beginning* of checkpoint record, so commit_lsn + # must be actually higher + assert commit_lsn > ckpt_lsn, "safekeeper must have checkpoint record" + + class SafekeeperEnv: def __init__( self, @@ -1590,7 +1631,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): with conn.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key)") sk = env.safekeepers[0] - sk_data_dir = Path(sk.data_dir()) + sk_data_dir = sk.data_dir if not auth_enabled: sk_http = sk.http_client() sk_http_other = sk_http @@ -1599,7 +1640,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): sk_http_other = sk.http_client( auth_token=env.auth_keys.generate_tenant_token(tenant_id_other) ) - sk_http_noauth = sk.http_client() + sk_http_noauth = sk.http_client(gen_sk_wide_token=False) assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir() assert (sk_data_dir / str(tenant_id) 
/ str(timeline_id_2)).is_dir() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() @@ -1623,7 +1664,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): endpoint_3.stop_and_destroy() # Remove initial tenant's br1 (active) - assert sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"] + assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() @@ -1631,7 +1672,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Ensure repeated deletion succeeds - assert not sk_http.timeline_delete_force(tenant_id, timeline_id_1)["dir_existed"] + assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"] assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() @@ -1642,13 +1683,13 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Ensure we cannot delete the other tenant for sk_h in [sk_http, sk_http_noauth]: with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): - assert sk_h.timeline_delete_force(tenant_id_other, timeline_id_other) + assert sk_h.timeline_delete(tenant_id_other, timeline_id_other) with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"): assert sk_h.tenant_delete_force(tenant_id_other) assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove initial tenant's br2 (inactive) - assert sk_http.timeline_delete_force(tenant_id, timeline_id_2)["dir_existed"] + assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"] assert 
not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir() @@ -1656,7 +1697,7 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir() # Remove non-existing branch, should succeed - assert not sk_http.timeline_delete_force(tenant_id, TimelineId("00" * 16))["dir_existed"] + assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"] assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists() assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists() assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists() @@ -1682,9 +1723,9 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): cur.execute("INSERT INTO t (key) VALUES (123)") +# Basic pull_timeline test. def test_pull_timeline(neon_env_builder: NeonEnvBuilder): - def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: - return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names]) + neon_env_builder.auth_enabled = True def execute_payload(endpoint: Endpoint): with closing(endpoint.connect()) as conn: @@ -1701,7 +1742,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId): for sk in safekeepers: - http_cli = sk.http_client() + http_cli = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id)) try: status = http_cli.timeline_status(tenant_id, timeline_id) log.info(f"Safekeeper {sk.id} status: {status}") @@ -1711,11 +1752,11 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 4 env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = 
env.neon_cli.create_branch("test_pull_timeline") + timeline_id = env.initial_timeline log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - endpoint = env.endpoints.create("test_pull_timeline") + endpoint = env.endpoints.create("main") endpoint.active_safekeepers = [1, 2, 3] endpoint.start() @@ -1731,7 +1772,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): res = ( env.safekeepers[3] - .http_client() + .http_client(auth_token=env.auth_keys.generate_safekeeper_token()) .pull_timeline( { "tenant_id": str(tenant_id), @@ -1749,7 +1790,7 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Restarting compute with new config to verify that it works") - endpoint.stop_and_destroy().create("test_pull_timeline") + endpoint.stop_and_destroy().create("main") endpoint.active_safekeepers = [1, 3, 4] endpoint.start() @@ -1771,6 +1812,133 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder): show_statuses(env.safekeepers, tenant_id, timeline_id) +# Test pull_timeline while concurrently gc'ing WAL on safekeeper: +# 1) Start pull_timeline, listing files to fetch. +# 2) Write segment, do gc. +# 3) Finish pull_timeline. +# 4) Do some write, verify integrity with timeline_digest. +# Expected to fail while holding off WAL gc plus fetching commit_lsn WAL +# segment is not implemented. 
+def test_pull_timeline_gc(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + endpoint = env.endpoints.create("main") + endpoint.active_safekeepers = [1, 2] + endpoint.start() + endpoint.safe_psql("create table t(key int, value text)") + endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + + src_flush_lsn = src_sk.get_flush_lsn(tenant_id, timeline_id) + log.info(f"flush_lsn on src before pull_timeline: {src_flush_lsn}") + + src_http = src_sk.http_client() + # run pull_timeline which will halt before downloading files + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) + pt_handle = PropagatingThread( + target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) + ) + pt_handle.start() + src_sk.wait_until_paused("sk-snapshot-after-list-pausable") + + # ensure segment exists + endpoint.safe_psql("insert into t select generate_series(1, 180000), 'papaya'") + lsn = last_flush_lsn_upload( + env, + endpoint, + tenant_id, + timeline_id, + auth_token=env.auth_keys.generate_tenant_token(tenant_id), + ) + assert lsn > Lsn("0/2000000") + # Checkpoint timeline beyond lsn. 
+ src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn, wait_wal_removal=False) + first_segment_p = src_sk.timeline_dir(tenant_id, timeline_id) / "000000010000000000000001" + log.info(f"first segment exist={os.path.exists(first_segment_p)}") + + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "off")) + pt_handle.join() + + # after pull_timeline is finished WAL should be removed on donor + src_sk.checkpoint_up_to(tenant_id, timeline_id, lsn, wait_wal_removal=True) + + timeline_start_lsn = src_sk.get_timeline_start_lsn(tenant_id, timeline_id) + dst_flush_lsn = dst_sk.get_flush_lsn(tenant_id, timeline_id) + log.info(f"flush_lsn on dst after pull_timeline: {dst_flush_lsn}") + assert dst_flush_lsn >= src_flush_lsn + digests = [ + sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, dst_flush_lsn) + for sk in [src_sk, dst_sk] + ] + assert digests[0] == digests[1], f"digest on src is {digests[0]} but on dst is {digests[1]}" + + +# Test pull_timeline while concurrently changing term on the donor: +# 1) Start pull_timeline, listing files to fetch. +# 2) Change term on the donor +# 3) Finish pull_timeline. +# +# Currently (until proper membership change procedure), we want to pull_timeline +# to fetch the log up to . This is unsafe if term +# changes during the procedure (unless timeline is locked all the time but we +# don't want that): recipient might end up with mix of WAL from different +# histories. Thus the schedule above is expected to fail. Later we'd allow +# pull_timeline to only initialize timeline to any valid state (up to +# commit_lsn), holding switch to fully new configuration until it recovers +# enough, so it won't be affected by term change anymore. +# +# Expected to fail while term check is not implemented. 
+def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): + neon_env_builder.auth_enabled = True + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[2]) + + log.info("use only first 2 safekeepers, 3rd will be seeded") + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 1000), 'pear'") + + src_http = src_sk.http_client() + # run pull_timeline which will halt before downloading files + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "pause")) + pt_handle = PropagatingThread( + target=dst_sk.pull_timeline, args=([src_sk], tenant_id, timeline_id) + ) + pt_handle.start() + src_sk.wait_until_paused("sk-snapshot-after-list-pausable") + + src_http = src_sk.http_client() + term_before = src_http.timeline_status(tenant_id, timeline_id).term + + # restart compute to bump term + ep.stop() + ep = env.endpoints.create("main") + ep.active_safekeepers = [1, 2] + ep.start() + ep.safe_psql("insert into t select generate_series(1, 100), 'pear'") + + term_after = src_http.timeline_status(tenant_id, timeline_id).term + assert term_after > term_before, f"term_after={term_after}, term_before={term_before}" + + src_http.configure_failpoints(("sk-snapshot-after-list-pausable", "off")) + with pytest.raises(requests.exceptions.HTTPError): + pt_handle.join() + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. 
In that case # pageserver should maintain a single connection to safekeeper and don't attempt @@ -1787,7 +1955,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_sk_auth_restart_endpoint") + timeline_id = env.neon_cli.create_branch("test_idle_reconnections") def collect_stats() -> Dict[str, float]: # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers @@ -1818,7 +1986,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): collect_stats() - endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint") + endpoint = env.endpoints.create_start("test_idle_reconnections") # just write something to the timeline endpoint.safe_psql("create table t(i int)") collect_stats() @@ -1918,3 +2086,95 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): assert orig_digest == new_digest # TODO: test timelines can start after copy + + +def test_patch_control_file(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + # initialize safekeeper + endpoint.safe_psql("create table t(key int, value text)") + + # update control file + res = ( + env.safekeepers[0] + .http_client() + .patch_control_file( + tenant_id, + timeline_id, + { + "timeline_start_lsn": "0/1", + }, + ) + ) + + timeline_start_lsn_before = res["old_control_file"]["timeline_start_lsn"] + timeline_start_lsn_after = res["new_control_file"]["timeline_start_lsn"] + + log.info(f"patch_control_file response: {res}") + log.info( + f"updated control file timeline_start_lsn, before {timeline_start_lsn_before}, after {timeline_start_lsn_after}" + ) + + assert timeline_start_lsn_after == "0/1" + env.safekeepers[0].stop().start() + + # wait/check that 
safekeeper is alive + endpoint.safe_psql("insert into t values (1, 'payload')") + + # check that timeline_start_lsn is updated + res = ( + env.safekeepers[0] + .http_client() + .debug_dump({"dump_control_file": "true", "timeline_id": str(timeline_id)}) + ) + log.info(f"dump_control_file response: {res}") + assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1" + + +# Test disables periodic pushes from safekeeper to the broker and checks that +# pageserver can still discover safekeepers with discovery requests. +def test_broker_discovery(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("test_broker_discovery") + + endpoint = env.endpoints.create_start( + "test_broker_discovery", + config_lines=["shared_buffers=1MB"], + ) + endpoint.safe_psql("create table t(i int, payload text)") + # Install extension containing function needed to clear buffer + endpoint.safe_psql("CREATE EXTENSION neon_test_utils") + + def do_something(): + time.sleep(1) + # generate some data to commit WAL on safekeepers + endpoint.safe_psql("insert into t select generate_series(1,100), 'action'") + # clear the buffers + endpoint.safe_psql("select clear_buffer_cache()") + # read data to fetch pages from pageserver + endpoint.safe_psql("select sum(i) from t") + + do_something() + do_something() + + for sk in env.safekeepers: + # Disable periodic broker push, so pageserver won't be able to discover + # safekeepers without sending a discovery request + sk.stop().start(extra_opts=["--disable-periodic-broker-push"]) + + do_something() + do_something() + + # restart pageserver and check how everything works + env.pageserver.stop().start() + + do_something() + do_something() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 77d67cd63a..971fad787a 
100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -8,9 +8,10 @@ from typing import List, Optional import asyncpg import pytest import toml +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper -from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.remote_storage import RemoteStorageKind log = getLogger("root.safekeeper_async") @@ -76,20 +77,20 @@ class WorkerStats(object): self.counters[worker_id] += 1 def check_progress(self): - log.debug("Workers progress: {}".format(self.counters)) + log.debug(f"Workers progress: {self.counters}") # every worker should finish at least one tx assert all(cnt > 0 for cnt in self.counters) progress = sum(self.counters) - log.info("All workers made {} transactions".format(progress)) + log.info(f"All workers made {progress} transactions") async def run_random_worker( stats: WorkerStats, endpoint: Endpoint, worker_id, n_accounts, max_transfer ): pg_conn = await endpoint.connect_async() - log.debug("Started worker {}".format(worker_id)) + log.debug(f"Started worker {worker_id}") while stats.running: from_uid = random.randint(0, n_accounts - 1) @@ -99,9 +100,9 @@ async def run_random_worker( await bank_transfer(pg_conn, from_uid, to_uid, amount) stats.inc_progress(worker_id) - log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid)) + log.debug(f"Executed transfer({amount}) {from_uid} => {to_uid}") - log.debug("Finished worker {}".format(worker_id)) + log.debug(f"Finished worker {worker_id}") await pg_conn.close() @@ -199,7 +200,9 @@ async def run_restarts_under_load( # assert that at least one transaction has completed in every worker stats.check_progress() - victim.start() + # testing #6530, temporary here + # TODO: remove after partial backup is enabled by default + 
victim.start(extra_opts=["--partial-backup-enabled", "--partial-backup-timeout=2s"]) log.info("Iterations are finished, exiting coroutines...") stats.running = False @@ -213,6 +216,7 @@ async def run_restarts_under_load( # Restart acceptors one by one, while executing and validating bank transactions def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() env.neon_cli.create_branch("test_safekeepers_restarts_under_load") @@ -250,7 +254,9 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): ) -def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): +def endpoint_create_start( + env: NeonEnv, branch: str, pgdir_name: Optional[str], allow_multiple: bool = False +): endpoint = Endpoint( env, tenant_id=env.initial_tenant, @@ -264,14 +270,23 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): # embed current time in endpoint ID endpoint_id = pgdir_name or f"ep-{time.time()}" return endpoint.create_start( - branch_name=branch, endpoint_id=endpoint_id, config_lines=["log_statement=all"] + branch_name=branch, + endpoint_id=endpoint_id, + config_lines=["log_statement=all"], + allow_multiple=allow_multiple, ) async def exec_compute_query( - env: NeonEnv, branch: str, query: str, pgdir_name: Optional[str] = None + env: NeonEnv, + branch: str, + query: str, + pgdir_name: Optional[str] = None, + allow_multiple: bool = False, ): - with endpoint_create_start(env, branch=branch, pgdir_name=pgdir_name) as endpoint: + with endpoint_create_start( + env, branch=branch, pgdir_name=pgdir_name, allow_multiple=allow_multiple + ) as endpoint: before_conn = time.time() conn = await endpoint.connect_async() res = await conn.fetch(query) @@ -343,6 +358,7 @@ class BackgroundCompute(object): self.branch, f"INSERT INTO query_log(index, verify_key) VALUES 
({self.index}, {verify_key}) RETURNING verify_key", pgdir_name=f"bgcompute{self.index}_key{verify_key}", + allow_multiple=True, ) log.info(f"result: {res}") if len(res) != 1: @@ -515,6 +531,103 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): asyncio.run(run_recovery_uncommitted(env)) +async def run_wal_truncation(env: NeonEnv): + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (sk1, sk2, sk3) = env.safekeepers + + ep = env.endpoints.create_start("main") + ep.safe_psql("create table t (key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + # insert with only one sk3 up to create tail of flushed but not committed WAL on it + sk1.stop() + sk2.stop() + conn = await ep.connect_async() + # query should hang, so execute in separate task + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1, 180000), 'Papaya'") + ) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # it must still be not finished + assert not bg_query.done() + # note: destroy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. 
+ ep.stop_and_destroy() + + # stop sk3 as well + sk3.stop() + + # now start sk1 and sk2 and make them commit something + sk1.start() + sk2.start() + ep = env.endpoints.create_start( + "main", + ) + ep.safe_psql("insert into t select generate_series(1, 200), 'payload'") + + # start sk3 and wait for it to catch up + sk3.start() + flush_lsn = Lsn(ep.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()")) + await wait_for_lsn(sk3, tenant_id, timeline_id, flush_lsn) + + timeline_start_lsn = sk1.get_timeline_start_lsn(tenant_id, timeline_id) + digests = [ + sk.http_client().timeline_digest(tenant_id, timeline_id, timeline_start_lsn, flush_lsn) + for sk in [sk1, sk2] + ] + assert digests[0] == digests[1], f"digest on sk1 is {digests[0]} but on sk3 is {digests[1]}" + + +# Simple deterministic test creating tail of WAL on safekeeper which is +# truncated when majority without this sk elects walproposer starting earlier. +def test_wal_truncation(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + asyncio.run(run_wal_truncation(env)) + + +async def run_segment_init_failure(env: NeonEnv): + env.neon_cli.create_branch("test_segment_init_failure") + ep = env.endpoints.create_start("test_segment_init_failure") + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + sk = env.safekeepers[0] + sk_http = sk.http_client() + sk_http.configure_failpoints([("sk-write-zeroes", "return")]) + conn = await ep.connect_async() + ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary + # next insertion should hang until failpoint is disabled. + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1,1), 'payload'") + ) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # it must still be not finished + assert not bg_query.done() + # Also restart ep at segment boundary to make test more interesting. 
Do it in immediate mode; + # fast will hang because it will try to gracefully finish sending WAL. + ep.stop(mode="immediate") + # Without segment rename during init (#6402) previous statement created + # partially initialized 16MB segment, so sk restart also triggers #6401. + sk.stop().start() + ep = env.endpoints.create_start("test_segment_init_failure") + ep.safe_psql("insert into t select generate_series(1,1), 'payload'") # should be ok now + + +# Test (injected) failure during WAL segment init. +# https://github.com/neondatabase/neon/issues/6401 +# https://github.com/neondatabase/neon/issues/6402 +def test_segment_init_failure(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + asyncio.run(run_segment_init_failure(env)) + + @dataclass class RaceConditionTest: iteration: int diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 7ac6e6332c..6582b34218 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -1,8 +1,9 @@ import time +from typing import Any, Dict +from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder -from fixtures.types import Lsn, TenantId # Checks that pageserver's walreceiver state is printed in the logs during WAL wait timeout. @@ -42,10 +43,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): # Kills one of the safekeepers and ensures that only the active ones are printed in the state. 
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder): # Trigger WAL wait timeout faster - neon_env_builder.pageserver_config_override = """ - wait_lsn_timeout = "1s" - tenant_config={walreceiver_connect_timeout = "2s", lagging_wal_timeout = "2s"} - """ + def customize_pageserver_toml(ps_cfg: Dict[str, Any]): + ps_cfg["wait_lsn_timeout"] = "1s" + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["walreceiver_connect_timeout"] = "2s" + tenant_config["lagging_wal_timeout"] = "2s" + + neon_env_builder.pageserver_config_override = customize_pageserver_toml + # Have notable SK ids to ensure we check logs for their presence, not some other random numbers neon_env_builder.safekeepers_id_start = 12345 neon_env_builder.num_safekeepers = 3 diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 7d03f644d1..01a1d5cf55 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -2,19 +2,27 @@ import sys import tarfile import tempfile from pathlib import Path +from typing import List import pytest import zstandard +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, VanillaPostgres, ) -from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.pageserver.utils import ( + list_prefix, + remote_storage_delete_key, + timeline_delete_wait_completed, +) from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import LocalFsStorage -from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.remote_storage import LocalFsStorage, S3Storage, s3_storage +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) @pytest.mark.skipif( @@ -128,7 +136,11 @@ def test_wal_restore_initdb( assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)] -def 
test_wal_restore_http(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("broken_tenant", [True, False]) +def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool): + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") endpoint.safe_psql("create table t as select generate_series(1,300000)") @@ -137,12 +149,36 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder): ps_client = env.pageserver.http_client() + if broken_tenant: + env.pageserver.allowed_errors.append( + r".* Changing Active tenant to Broken state, reason: broken from test" + ) + ps_client.tenant_break(tenant_id) + + # Mark the initdb archive for preservation + ps_client.timeline_preserve_initdb_archive(tenant_id, timeline_id) + # shut down the endpoint and delete the timeline from the pageserver endpoint.stop() - assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + assert isinstance(env.pageserver_remote_storage, S3Storage) - timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) + if broken_tenant: + ps_client.tenant_detach(tenant_id) + objects: List[ObjectTypeDef] = list_prefix( + env.pageserver_remote_storage, f"tenants/{tenant_id}/timelines/{timeline_id}/" + ).get("Contents", []) + for obj in objects: + obj_key = obj["Key"] + if "initdb-preserved.tar.zst" in obj_key: + continue + log.info(f"Deleting key from remote storage: {obj_key}") + remote_storage_delete_key(env.pageserver_remote_storage, obj_key) + pass + + ps_client.tenant_attach(tenant_id, generation=10) + else: + timeline_delete_wait_completed(ps_client, tenant_id, timeline_id) # issue the restoration command ps_client.timeline_create( diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 13159efbe8..ad37807dba 100644 --- 
a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -2,10 +2,10 @@ import time import psutil import pytest +from fixtures.common_types import TenantId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.pageserver.http import PageserverApiException -from fixtures.types import TenantId def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False): diff --git a/test_runner/sql_regress/expected/neon-test-utils.out b/test_runner/sql_regress/expected/neon-test-utils.out new file mode 100644 index 0000000000..7d1634a6b8 --- /dev/null +++ b/test_runner/sql_regress/expected/neon-test-utils.out @@ -0,0 +1,28 @@ +-- Test the test utils in pgxn/neon_test_utils. We don't test that +-- these actually consume resources like they should - that would be +-- tricky - but at least we check that they don't crash. +CREATE EXTENSION neon_test_utils; +select test_consume_cpu(1); + test_consume_cpu +------------------ + +(1 row) + +select test_consume_memory(20); -- Allocate 20 MB + test_consume_memory +--------------------- + +(1 row) + +select test_release_memory(5); -- Release 5 MB + test_release_memory +--------------------- + +(1 row) + +select test_release_memory(); -- Release the remaining 15 MB + test_release_memory +--------------------- + +(1 row) + diff --git a/test_runner/sql_regress/parallel_schedule b/test_runner/sql_regress/parallel_schedule index 569c7b5066..d9508d1c90 100644 --- a/test_runner/sql_regress/parallel_schedule +++ b/test_runner/sql_regress/parallel_schedule @@ -7,4 +7,5 @@ test: neon-cid test: neon-rel-truncate test: neon-clog +test: neon-test-utils test: neon-vacuum-full diff --git a/test_runner/sql_regress/sql/neon-test-utils.sql b/test_runner/sql_regress/sql/neon-test-utils.sql new file mode 100644 index 0000000000..c5ca6c624b --- /dev/null +++ b/test_runner/sql_regress/sql/neon-test-utils.sql @@ 
-0,0 +1,11 @@ +-- Test the test utils in pgxn/neon_test_utils. We don't test that +-- these actually consume resources like they should - that would be +-- tricky - but at least we check that they don't crash. + +CREATE EXTENSION neon_test_utils; + +select test_consume_cpu(1); + +select test_consume_memory(20); -- Allocate 20 MB +select test_release_memory(5); -- Release 5 MB +select test_release_memory(); -- Release the remaining 15 MB diff --git a/trace/src/main.rs b/trace/src/main.rs index ddd970e95d..049f922b6f 100644 --- a/trace/src/main.rs +++ b/trace/src/main.rs @@ -7,7 +7,9 @@ use std::{ io::BufReader, }; -use pageserver_api::models::{PagestreamFeMessage, PagestreamGetPageRequest}; +use pageserver_api::models::{ + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamProtocolVersion, +}; use utils::id::{ConnectionId, TenantId, TimelineId}; use clap::{Parser, Subcommand}; @@ -56,10 +58,11 @@ fn analyze_trace(mut reader: R) { let mut prev: Option = None; // Compute stats - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { match msg { PagestreamFeMessage::Exists(_) => {} PagestreamFeMessage::Nblocks(_) => {} + PagestreamFeMessage::GetSlruSegment(_) => {} PagestreamFeMessage::GetPage(req) => { total += 1; @@ -88,7 +91,7 @@ fn analyze_trace(mut reader: R) { } fn dump_trace(mut reader: R) { - while let Ok(msg) = PagestreamFeMessage::parse(&mut reader) { + while let Ok(msg) = PagestreamFeMessage::parse(&mut reader, PagestreamProtocolVersion::V2) { println!("{msg:?}"); } } diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 03358bb0b5..4c51945a61 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 03358bb0b5e0d33c238710139e768db9e75cfcc8 +Subproject commit 4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index a2dc225ddf..e22098d86d 160000 --- a/vendor/postgres-v15 
+++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit a2dc225ddfc8cae1849aa2316f435c58f0333d8c +Subproject commit e22098d86d6c40276b6bd75c29133a33fb283ab6 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 225071f482..9837db1578 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 225071f482774943854c2eec4540757e01171557 +Subproject commit 9837db157837fcf43ef7348be0017d3a2238cd27 diff --git a/vendor/revisions.json b/vendor/revisions.json index def4eab069..f945ea6d73 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "postgres-v16": "225071f482774943854c2eec4540757e01171557", - "postgres-v15": "a2dc225ddfc8cae1849aa2316f435c58f0333d8c", - "postgres-v14": "03358bb0b5e0d33c238710139e768db9e75cfcc8" + "v16": ["16.3", "9837db157837fcf43ef7348be0017d3a2238cd27"], + "v15": ["15.7", "e22098d86d6c40276b6bd75c29133a33fb283ab6"], + "v14": ["14.12", "4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba"] } diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 704e3721d6..3c446ecdea 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -5,21 +5,36 @@ commands: user: root sysvInitAction: sysinit shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. 
+ - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' - name: pgbouncer - user: nobody + user: postgres sysvInitAction: respawn shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' - name: sql-exporter user: nobody sysvInitAction: respawn - shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml' + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' shutdownHook: | su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' files: + - filename: compute_ctl-resize-swap + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap - filename: pgbouncer.ini content: | [databases] @@ -36,7 +51,9 @@ files: max_client_conn=10000 default_pool_size=64 max_prepared_statements=0 - admin_users=cloud_admin + admin_users=postgres + unix_socket_dir=/tmp/ + unix_socket_mode=0777 - filename: cgconfig.conf content: | # Configuration for cgroups in VM compute nodes @@ -76,7 +93,7 @@ files: target: # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) # the schema gets dropped or replaced to match the driver expected DSN format. 
- data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable' + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' # Collectors (referenced by name) to execute on the target. # Glob patterns are supported (see for syntax). @@ -86,6 +103,41 @@ files: # Glob patterns are supported (see for syntax). collector_files: - "neon_collector.yml" + - filename: sql_exporter_autoscaling.yml + content: | + # Configuration for sql_exporter for autoscaling-agent + # Global defaults. + global: + # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: 10s + # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: 500ms + # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: 0s + # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + # as will concurrent scrapes. + max_connections: 1 + # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + # always be the same as max_connections. + max_idle_connections: 1 + # Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + # If 0, connections are not closed due to a connection's age. + max_connection_lifetime: 5m + + # The target to monitor and the collectors to execute on it. + target: + # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + # the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' + + # Collectors (referenced by name) to execute on the target. 
+ # Glob patterns are supported (see for syntax). + collectors: [neon_collector_autoscaling] + + # Collector files specifies a list of globs. One collector definition is read from each matching file. + # Glob patterns are supported (see for syntax). + collector_files: + - "neon_collector_autoscaling.yml" - filename: neon_collector.yml content: | collector_name: neon_collector @@ -100,7 +152,7 @@ files: - metric_name: lfc_used type: gauge - help: 'lfc_used' + help: 'LFC chunks used (chunk = 1MB)' key_labels: values: [lfc_used] query: | @@ -122,6 +174,216 @@ files: query: | select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + - metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + + - metric_name: connection_counts + type: gauge + help: 'Connection counts' + key_labels: + - datname + - state + values: [count] + query: | + select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; + + - metric_name: pg_stats_userdb + type: gauge + help: 'Stats for several oldest non-system dbs' + key_labels: + - datname + value_label: kind + values: + - db_size + - deadlocks + # Rows + - inserted + - updated + - deleted + # We export stats for 10 non-system database. Without this limit + # it is too easy to abuse the system by creating lots of databases. 
+ query: | + select pg_database_size(datname) as db_size, deadlocks, + tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, + datname + from pg_stat_database + where datname IN ( + select datname + from pg_database + where datname <> 'postgres' and not datistemplate + order by oid + limit 10 + ); + + - metric_name: max_cluster_size + type: gauge + help: 'neon.max_cluster_size setting' + key_labels: + values: [max_cluster_size] + query: | + select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; + + - metric_name: db_total_size + type: gauge + help: 'Size of all databases' + key_labels: + values: [total] + query: | + select sum(pg_database_size(datname)) as total from pg_database; + + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; + + - metric_name: current_lsn + type: gauge + help: 'Current LSN of the database' + key_labels: + values: [lsn] + query: | + select + case + when pg_catalog.pg_is_in_recovery() + then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + else (pg_current_wal_lsn() - '0/0')::FLOAT8 + end as lsn; + + - metric_name: replication_delay_bytes + type: gauge + help: 'Bytes between received and replayed LSN' + key_labels: + values: [replication_delay_bytes] + query: | + SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes; + + - metric_name: replication_delay_seconds + type: gauge + help: 'Time since last LSN was replayed' + key_labels: + values: [replication_delay_seconds] + query: | + SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; + + - metric_name: checkpoints_req + type: 
gauge + help: 'Number of requested checkpoints' + key_labels: + values: [checkpoints_req] + query: | + SELECT checkpoints_req FROM pg_stat_bgwriter; + + - metric_name: checkpoints_timed + type: gauge + help: 'Number of scheduled checkpoints' + key_labels: + values: [checkpoints_timed] + query: | + SELECT checkpoints_timed FROM pg_stat_bgwriter; + + # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. + # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. + + # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. + - metric_name: logical_slot_restart_lsn + type: gauge + help: 'restart_lsn of logical slots' + key_labels: + - slot_name + values: [restart_lsn] + query: | + select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn + from pg_replication_slots + where slot_type = 'logical'; + + - metric_name: retained_wal + type: gauge + help: 'Retained WAL in inactive replication slots' + key_labels: + - slot_name + values: [retained_wal] + query: | + SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal + FROM pg_replication_slots + WHERE active = false; + + - metric_name: wal_is_lost + type: gauge + help: 'Whether or not the replication slot wal_status is lost' + key_labels: + - slot_name + values: [wal_is_lost] + query: | + SELECT slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_is_lost + FROM pg_replication_slots; + + - filename: neon_collector_autoscaling.yml + content: | + collector_name: neon_collector_autoscaling + metrics: + - metric_name: lfc_misses + type: gauge + help: 'lfc_misses' + key_labels: + values: [lfc_misses] + query: | + select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; + + - metric_name: lfc_used + type: gauge + help: 'LFC chunks used (chunk = 1MB)' + key_labels: + values: [lfc_used] + query: | + select lfc_value 
as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; + + - metric_name: lfc_hits + type: gauge + help: 'lfc_hits' + key_labels: + values: [lfc_hits] + query: | + select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; + + - metric_name: lfc_writes + type: gauge + help: 'lfc_writes' + key_labels: + values: [lfc_writes] + query: | + select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; + + - metric_name: lfc_cache_size_limit + type: gauge + help: 'LFC cache size limit in bytes' + key_labels: + values: [lfc_cache_size_limit] + query: | + select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; + + - metric_name: lfc_approximate_working_set_size + type: gauge + help: 'Approximate working set size in pages of 8192 bytes' + key_labels: + values: [approximate_working_set_size] + query: | + select neon.approximate_working_set_size(false) as approximate_working_set_size; build: | # Build cgroup-tools # @@ -156,7 +418,7 @@ build: | # actually build the thing... && make install - FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter + FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter FROM burningalchemist/sql_exporter:0.13 AS sql-exporter @@ -172,11 +434,10 @@ build: | libtool \ pkg-config - # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits. 
# Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) - ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1 + ENV PGBOUNCER_TAG pgbouncer_1_22_1 RUN set -e \ - && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ @@ -191,17 +452,32 @@ merge: | && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ ) + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. + # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. + RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap + COPY cgconfig.conf /etc/cgconfig.conf COPY pgbouncer.ini /etc/pgbouncer.ini COPY sql_exporter.yml /etc/sql_exporter.yml COPY neon_collector.yml /etc/neon_collector.yml + COPY sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml + COPY neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml RUN set -e \ && chown postgres:postgres /etc/pgbouncer.ini \ - && chmod 0644 /etc/pgbouncer.ini \ + && chmod 0666 /etc/pgbouncer.ini \ && chmod 0644 /etc/cgconfig.conf \ && chmod 0644 /etc/sql_exporter.yml \ - && chmod 0644 /etc/neon_collector.yml + && chmod 0644 /etc/neon_collector.yml \ + && chmod 0644 /etc/sql_exporter_autoscaling.yml \ + && chmod 0644 /etc/neon_collector_autoscaling.yml COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 
57aa1ef0bc..df16c71789 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -13,14 +13,14 @@ publish = false ### BEGIN HAKARI SECTION [dependencies] +ahash = { version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] } -aws-runtime = { version = "1", default-features = false, features = ["event-stream", "sigv4a"] } +aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] } aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] } aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] } aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] } -aws-smithy-runtime-api = { version = "1", features = ["client", "http-02x", "http-auth"] } -aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "rt-tokio"] } +aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] } axum = { version = "0.6", features = ["ws"] } base64 = { version = "0.21", features = ["alloc"] } base64ct = { version = "1", default-features = false, features = ["std"] } @@ -29,10 +29,8 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } -dashmap = { version = "5", default-features = false, features = ["raw-api"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-core = { 
version = "0.3" } futures-executor = { version = "0.3" } @@ -40,73 +38,83 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } +hashbrown = { version = "0.14", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } +indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } -libc = { version = "0.2", features = ["extra_traits"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128"] } +num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } -ring = { version = "0.16" } +reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", 
"rustls-tls", "stream"] } +reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "rustls-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } -smallvec = { version = "1", default-features = false, features = ["write"] } +sha2 = { version = "0.10", features = ["asm"] } +smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } -time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] } +sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } +time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } toml_edit = { version = "0.19", features = ["serde"] } +tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tungstenite = { version = "0.20" } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } +zeroize = { version = "1", features = ["derive"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } [build-dependencies] +ahash = { 
version = "0.8" } anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } +hashbrown = { version = "0.14", features = ["raw"] } +indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } -libc = { version = "0.2", features = ["extra_traits"] } +libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128"] } +num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } -parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] } +parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } prost = { version = "0.11" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = 
"0.2", default-features = false, features = ["formatting", "parsing", "serde"] } +toml_datetime = { version = "0.6", default-features = false, features = ["serde"] } +toml_edit = { version = "0.19", features = ["serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] }